[AARCH64] Introduce aarch64 atomic_{load,store}ti patterns

Message ID VI1PR0801MB2016F12A3A02340ECA919223E0140@VI1PR0801MB2016.eurprd08.prod.outlook.com
State New
Series [AARCH64] Introduce aarch64 atomic_{load,store}ti patterns

Commit Message

Matthew Malcomson Sept. 27, 2018, 1:43 p.m. UTC
[PATCH][GCC][AARCH64] Introduce aarch64 atomic_{load,store}ti patterns

On Armv8.4-a these patterns use the LDP/STP instructions, which are guaranteed
to be single-copy atomic, and ensure correct memory ordering semantics by using
the DMB instruction.
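
As a rough illustration of the Armv8.4-a path (a sketch only; the function
names and register numbers below are placeholders for this example), these are
the kinds of C-level accesses the new patterns cover, with the sequences the
new insn templates are expected to emit noted in comments:

  #include <stdatomic.h>

  __int128
  load_acquire (_Atomic __int128 *p)
  {
    /* Expected from the new aarch64_atomic_loadti_ldp pattern:
         ldp   xN, xM, [xP]
         dmb   ishld
       (the DMB is omitted for a relaxed load).  */
    return atomic_load_explicit (p, memory_order_acquire);
  }

  void
  store_release (_Atomic __int128 *p, __int128 v)
  {
    /* Expected from the new aarch64_atomic_storeti_stp pattern:
         dmb   ish
         stp   xN, xM, [xP]
       (the DMB is omitted for a relaxed store).  */
    atomic_store_explicit (p, v, memory_order_release);
  }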

We put these inline expansions behind a command line flag since they do not
satisfy the libatomic ABI and hence can't be used together with code already
compiled to use 16 byte atomics through libatomic.
This command line flag is -matomic-128bit-instructions.
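
As a usage sketch (the file and symbol names here are invented for the
example), a translation unit built with
  gcc -O2 -march=armv8.4-a -matomic-128bit-instructions -c shared.c
is expected to inline the accesses below, while the same file built without
the flag calls libatomic's __atomic_load_16/__atomic_store_16; the two forms
must not be mixed within one program:

  #include <stdatomic.h>

  _Atomic __int128 counter;

  __int128
  read_counter (void)
  {
    return atomic_load (&counter);
  }

  void
  write_counter (__int128 v)
  {
    atomic_store (&counter, v);
  }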

Given the introduction of a flag specified to break ABI compatibility with
libatomic, it seems reasonable to also introduce the
load-exclusive/store-exclusive read-modify-write loop emulation of 128 bit
atomic loads and stores for older architectures behind this flag.
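
A sketch of that emulation (illustrative only; register numbers and the label
are placeholders), matching what aarch64_split_atomic_ti_access emits after
reload:

  #include <stdatomic.h>

  __int128
  load_acquire_emulated (_Atomic __int128 *p)
  {
    /* Without the Armv8.4-a guarantee this load is expected to be split
       into a retry loop roughly of the form
         1: ldaxp  x0, x1, [x2]        // load-exclusive with acquire
            stxp   w3, x0, x1, [x2]    // store the same value back, relaxed
            cbnz   w3, 1b              // retry if exclusivity was lost
       while a release store uses a relaxed LDXP followed by an STLXP in the
       same style of loop.  */
    return atomic_load_explicit (p, memory_order_acquire);
  }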

We introduce the usual extension macros for the "at" extension marking the
LDP/STP atomicity guarantees introduced in Armv8.4-a, and use these to decide
which expansion to use when -matomic-128bit-instructions is provided on the
command line.

Tested with full bootstrap and make check on aarch64-none-linux-gnu.
Ok for trunk?

gcc/ChangeLog:

2018-09-27  Matthew Malcomson  <matthew.malcomson@arm.com>

	* config/aarch64/aarch64-protos.h (aarch64_split_atomic_ti_access): New
	prototype.
	* config/aarch64/aarch64.c (aarch64_split_atomic_ti_access): New.
	* config/aarch64/aarch64.h (AARCH64_FL_AT): New flag.
	(AARCH64_FL_PROFILE): Flag moved to accommodate above.
	(AARCH64_FL_FOR_ARCH8_4): Include AARCH64_FL_AT.
	(AARCH64_ISA_AT): New ISA flag.
	* config/aarch64/aarch64.opt (-matomic-128bit-instructions): New.
	* config/aarch64/atomics.md (atomic_load<mode>, atomic_store<mode>,
	@aarch64_load_exclusive<mode> {smaller registers},
	@aarch64_load_exclusive<mode> {GPI registers},
	@aarch64_store_exclusive<mode>): Use aarch_mm_needs_{acquire,release}
	instead of three part check.
	(atomic_loadti, aarch64_atomic_loadti_ldp, aarch64_atomic_loadti_loop,
	atomic_storeti, aarch64_atomic_storeti_stp,
	aarch64_atomic_storeti_loop): New.
	* config/aarch64/iterators.md (GPI_TI): New.
	* config/aarch64/predicates.md (aarch64_atomic_TImode_operand,
	aarch64_TImode_pair_operand): New.
	* doc/invoke.texi (-matomic-128bit-instructions): Document option.

gcc/testsuite/ChangeLog:

2018-09-27  Matthew Malcomson  <matthew.malcomson@arm.com>

	* gcc.target/aarch64/atomic-load128.c: New test.
	* gcc.target/aarch64/atomic-store.x: Shared macro for below tests.
	* gcc.target/aarch64/atomic-store.c: Use atomic-store.x.
	* gcc.target/aarch64/atomic-store128.c: New test using atomic-store.x.


###############     Attachment also inlined for ease of reply    ###############
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index caf1d2041f0cac8e3f975f8384a167a90dc638e5..578ea925fac9a7237af3a53e7ec642d0ba8e7b93 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -560,6 +560,8 @@ machine_mode aarch64_select_cc_mode (RTX_CODE, rtx, rtx);
 rtx aarch64_gen_compare_reg (RTX_CODE, rtx, rtx);
 rtx aarch64_load_tp (rtx);
 
+void aarch64_split_atomic_ti_access (rtx op[], bool);
+
 void aarch64_expand_compare_and_swap (rtx op[]);
 void aarch64_split_compare_and_swap (rtx op[]);
 void aarch64_gen_atomic_cas (rtx, rtx, rtx, rtx, rtx);
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index e5cdb1d54f4ee96140202ea21a9478438d208f45..c1e407b5a3f27aa7eea9c35e749fe597e79f3e65 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -158,9 +158,10 @@ extern unsigned aarch64_architecture_version;
 #define AARCH64_FL_SHA3	      (1 << 18)  /* Has ARMv8.4-a SHA3 and SHA512.  */
 #define AARCH64_FL_F16FML     (1 << 19)  /* Has ARMv8.4-a FP16 extensions.  */
 #define AARCH64_FL_RCPC8_4    (1 << 20)  /* Has ARMv8.4-a RCPC extensions.  */
+#define AARCH64_FL_AT         (1 << 21)  /* Has ARMv8.4-a AT extensions.  */
 
 /* Statistical Profiling extensions.  */
-#define AARCH64_FL_PROFILE    (1 << 21)
+#define AARCH64_FL_PROFILE    (1 << 22)
 
 /* Has FP and SIMD.  */
 #define AARCH64_FL_FPSIMD     (AARCH64_FL_FP | AARCH64_FL_SIMD)
@@ -179,7 +180,7 @@ extern unsigned aarch64_architecture_version;
   (AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_V8_3)
 #define AARCH64_FL_FOR_ARCH8_4			\
   (AARCH64_FL_FOR_ARCH8_3 | AARCH64_FL_V8_4 | AARCH64_FL_F16FML \
-   | AARCH64_FL_DOTPROD | AARCH64_FL_RCPC8_4)
+   | AARCH64_FL_DOTPROD | AARCH64_FL_RCPC8_4 | AARCH64_FL_AT)
 
 /* Macros to test ISA flags.  */
 
@@ -201,6 +202,7 @@ extern unsigned aarch64_architecture_version;
 #define AARCH64_ISA_SHA3	   (aarch64_isa_flags & AARCH64_FL_SHA3)
 #define AARCH64_ISA_F16FML	   (aarch64_isa_flags & AARCH64_FL_F16FML)
 #define AARCH64_ISA_RCPC8_4	   (aarch64_isa_flags & AARCH64_FL_RCPC8_4)
+#define AARCH64_ISA_AT	           (aarch64_isa_flags & AARCH64_FL_AT)
 
 /* Crypto is an optional extension to AdvSIMD.  */
 #define TARGET_CRYPTO (TARGET_SIMD && AARCH64_ISA_CRYPTO)
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index cbf9d0c09b23712a67a5f0781c247cc859ade18d..7b46ca38a8cf55c6359e2f577bb9e15363dd3132 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -14160,6 +14160,80 @@ aarch64_emit_post_barrier (enum memmodel model)
     }
 }
 
+/* Emit an emulation of an atomic access for TImode using a
+   load-exclusive/store-exclusive pair.  */
+void
+aarch64_split_atomic_ti_access (rtx operands[], bool loading)
+{
+  rtx dest, src, model_rtx, scratch;
+  dest = operands[0];
+  src = operands[1];
+  model_rtx = operands[2];
+  scratch = operands[3];
+
+  machine_mode mode = GET_MODE (src);
+  gcc_assert (mode == TImode || (mode == VOIDmode && src == const0_rtx));
+
+  rtx_code_label *label = gen_label_rtx ();
+  emit_label (label);
+
+  rtx scratch_flag;
+  /* In the below we use the definition that the ordering of sequentially
+     consistent memory ordering semantics on a load are the same as load-acquire
+     semantics, and similarly on a store the ordering semantics make the same
+     requirements as store-release semantics.
+
+     Sequential consistency does provide extra semantics to do with a total
+     ordering of atomic modifications of memory with sequentially consistent
+     semantics.  That memory ordering requirement is already provided by the
+     fact that the Armv8 memory model is other-multi-copy atomic (page B2-96 of
+     the ARM Architecture Reference Manual issue C.a) in combination with the
+     load-acquire/store-release semantics.
+
+     Given that the aim of this instruction is to behave as an
+     atomic_{load,store}ti these observations demonstrate that we do not need to
+     provide any special handling for sequentially consistent memory ordering
+     over and above the handling for load-acquire and store-release
+     semantics.  */
+  if (loading)
+    {
+      /* For load-acquire semantics we require that no reads or writes can be
+	 reordered to before the observed load.  Hence all we need is for that
+	 load to have the required memory ordering semantics.  */
+      scratch_flag = scratch;
+      emit_insn (gen_aarch64_load_exclusive (TImode, dest, src, model_rtx));
+      emit_insn (gen_aarch64_store_exclusive (TImode, scratch_flag, src, dest,
+					      GEN_INT (MEMMODEL_RELAXED)));
+    }
+  else
+    {
+      /* For store-release semantics we require that no memory access is
+	 reordered to after the store-exclusive that is observed.  This is
+	 satisfied by having that store-exclusive instruction execute with
+	 store-release memory semantics.  */
+      emit_insn (gen_aarch64_load_exclusive (TImode, scratch, dest,
+					     GEN_INT (MEMMODEL_RELAXED)));
+      scratch_flag = gen_lowpart (SImode, scratch);
+      emit_insn (gen_aarch64_store_exclusive (TImode, scratch_flag,
+					      dest, src, model_rtx));
+    }
+
+  rtx x;
+  if (aarch64_track_speculation)
+    {
+      /* Emit an explicit compare instruction, so that we can correctly
+	 track the condition codes.  */
+      rtx cc_reg = aarch64_gen_compare_reg (NE, scratch_flag, const0_rtx);
+      x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
+    }
+  else
+    x = gen_rtx_NE (VOIDmode, scratch_flag, const0_rtx);
+
+  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+			    gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
+  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
+}
+
 /* Emit an atomic compare-and-swap operation.  RVAL is the destination register
    for the data in memory.  EXPECTED is the value expected to be in memory.
    DESIRED is the value to store to memory.  MEM is the memory location.  MODEL
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index b2e80cbf6f1f9727c4309874b1122f975fb6b9be..3b1769ba67b6e94cc7c05e051902a8a0f2cfcbe0 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -218,3 +218,9 @@ Enables verbose cost model dumping in the debug dump files.
 mtrack-speculation
 Target Var(aarch64_track_speculation)
 Generate code to track when the CPU might be speculating incorrectly.
+
+matomic-128bit-instructions
+Target Var(aarch64_handle_128bit_atomics) Init(false)
+Use architectural atomic instructions to handle 128 bit atomic loads and stores
+instead of using libatomic.  The use of 128 bit atomics in code compiled with
+this option is ABI incompatible with that of code compiled without this option.
diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index bba8e9e9c8e61d95fcfb61e650e7e76671c8f996..b47abcf7250aa2045ab8ced52a1373f6d4d71047 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -472,11 +472,66 @@
       UNSPECV_LDA))]
   ""
   {
-    enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
-    if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_release (model))
-      return "ldr<atomic_sfx>\t%<w>0, %1";
-    else
+    if (aarch_mm_needs_acquire (operands[2]))
       return "ldar<atomic_sfx>\t%<w>0, %1";
+    else
+      return "ldr<atomic_sfx>\t%<w>0, %1";
+  }
+)
+
+(define_expand "atomic_loadti"
+ [(match_operand:TI 0 "register_operand" "")
+  (match_operand:TI 1 "aarch64_atomic_TImode_operand" "")
+  (match_operand:TI 2 "const_int_operand" "")]
+ "aarch64_handle_128bit_atomics"
+ {
+    if (AARCH64_ISA_AT)
+      {
+	emit_insn (gen_aarch64_atomic_loadti_ldp (operands[0], operands[1],
+						  operands[2]));
+	DONE;
+      }
+
+    emit_insn (gen_aarch64_atomic_loadti_loop (operands[0], operands[1],
+						operands[2]));
+    DONE;
+ }
+)
+
+(define_insn "aarch64_atomic_loadti_ldp"
+  [(set (match_operand:TI 0 "register_operand" "=r")
+    (unspec_volatile:TI
+      [(match_operand:TI 1 "aarch64_TImode_pair_operand" "Umn")
+       (match_operand:SI 2 "const_int_operand")]			;; model
+      UNSPECV_LDA))]
+  "aarch64_handle_128bit_atomics && AARCH64_ISA_AT"
+  {
+    output_asm_insn ("ldp\\t%0, %H0, %1", operands);
+    return aarch_mm_needs_acquire (operands[2])
+	  ? "dmb\\tishld"
+	  : "";
+  }
+  [(set (attr "length")
+	(if_then_else (match_test "aarch_mm_needs_acquire (operands[2])")
+		      (const_int 8)
+		      (const_int 4)))]
+)
+
+(define_insn_and_split "aarch64_atomic_loadti_loop"
+  [(set (match_operand:TI 0 "register_operand" "=&r")
+    (unspec_volatile:TI
+      [(match_operand:TI 1 "aarch64_sync_memory_operand" "Q")
+       (match_operand:SI 2 "const_int_operand")]			;; model
+      UNSPECV_LDA))
+   (clobber (reg:CC CC_REGNUM))
+   (clobber (match_scratch:SI 3 "=&r"))]
+  "aarch64_handle_128bit_atomics && !AARCH64_ISA_AT"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+  {
+    aarch64_split_atomic_ti_access (operands, true);
+    DONE;
   }
 )
 
@@ -488,8 +543,7 @@
       UNSPECV_STL))]
   ""
   {
-    enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
-    if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_acquire (model))
+    if (! aarch_mm_needs_release (operands[2]))
       return "str<atomic_sfx>\t%<w>1, %0";
     else if (which_alternative == 0)
       return "stlr<atomic_sfx>\t%<w>1, %0";
@@ -499,6 +553,61 @@
   [(set_attr "arch" "*,rcpc8_4")]
 )
 
+(define_expand "atomic_storeti"
+ [(match_operand:TI 0 "aarch64_atomic_TImode_operand" "")
+  (match_operand:TI 1 "aarch64_reg_or_zero" "")
+  (match_operand:TI 2 "const_int_operand" "")]
+ "aarch64_handle_128bit_atomics"
+ {
+    if (AARCH64_ISA_AT)
+      {
+	emit_insn (gen_aarch64_atomic_storeti_stp (operands[0], operands[1],
+						   operands[2]));
+	DONE;
+      }
+
+    emit_insn (gen_aarch64_atomic_storeti_loop (operands[0], operands[1],
+						 operands[2]));
+    DONE;
+ }
+)
+
+(define_insn "aarch64_atomic_storeti_stp"
+  [(set (match_operand:TI 0 "aarch64_TImode_pair_operand" "=Umn")
+    (unspec_volatile:TI
+      [(match_operand:TI 1 "aarch64_reg_or_zero" "rZ")
+       (match_operand:SI 2 "const_int_operand")]			;; model
+      UNSPECV_STL)) ]
+  "aarch64_handle_128bit_atomics && AARCH64_ISA_AT"
+  {
+    if (aarch_mm_needs_release (operands[2]))
+      output_asm_insn ("dmb\tish", operands);
+    return "stp\t%x1, %H1, %0";
+  }
+  [(set (attr "length")
+	(if_then_else (match_test "aarch_mm_needs_release (operands[2])")
+		      (const_int 8)
+		      (const_int 4)))]
+)
+
+(define_insn_and_split "aarch64_atomic_storeti_loop"
+  [(set (match_operand:TI 0 "aarch64_sync_memory_operand" "=Q")
+    (unspec_volatile:TI
+      [(match_operand:TI 1 "aarch64_reg_or_zero" "rZ")
+       (match_operand:SI 2 "const_int_operand")]			;; model
+      UNSPECV_STL))
+   (clobber (reg:CC CC_REGNUM))
+   (clobber (match_scratch:TI 3 "=&r"))]
+  "aarch64_handle_128bit_atomics && !AARCH64_ISA_AT"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+  {
+    aarch64_split_atomic_ti_access (operands, false);
+    DONE;
+  }
+)
+
 (define_insn "@aarch64_load_exclusive<mode>"
   [(set (match_operand:SI 0 "register_operand" "=r")
     (zero_extend:SI
@@ -508,45 +617,52 @@
 	UNSPECV_LX)))]
   ""
   {
-    enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
-    if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_release (model))
-      return "ldxr<atomic_sfx>\t%w0, %1";
-    else
+    if (aarch_mm_needs_acquire (operands[2]))
       return "ldaxr<atomic_sfx>\t%w0, %1";
+    else
+      return "ldxr<atomic_sfx>\t%w0, %1";
   }
 )
 
 (define_insn "@aarch64_load_exclusive<mode>"
-  [(set (match_operand:GPI 0 "register_operand" "=r")
-    (unspec_volatile:GPI
-      [(match_operand:GPI 1 "aarch64_sync_memory_operand" "Q")
+  [(set (match_operand:GPI_TI 0 "register_operand" "=r")
+    (unspec_volatile:GPI_TI
+      [(match_operand:GPI_TI 1 "aarch64_sync_memory_operand" "Q")
        (match_operand:SI 2 "const_int_operand")]
       UNSPECV_LX))]
   ""
   {
-    enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
-    if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_release (model))
-      return "ldxr\t%<w>0, %1";
+    bool acquire_needed = aarch_mm_needs_acquire (operands[2]);
+    if (GET_MODE (operands[1]) == TImode)
+      return acquire_needed
+	     ? "ldaxp\t%0, %H0, %1"
+	     : "ldxp\t%0, %H0, %1";
     else
-      return "ldaxr\t%<w>0, %1";
+      return acquire_needed
+	     ? "ldaxr\t%<w>0, %1"
+	     : "ldxr\t%<w>0, %1";
   }
 )
 
 (define_insn "@aarch64_store_exclusive<mode>"
   [(set (match_operand:SI 0 "register_operand" "=&r")
     (unspec_volatile:SI [(const_int 0)] UNSPECV_SX))
-   (set (match_operand:ALLI 1 "aarch64_sync_memory_operand" "=Q")
-    (unspec_volatile:ALLI
-      [(match_operand:ALLI 2 "aarch64_reg_or_zero" "rZ")
+   (set (match_operand:ALLI_TI 1 "aarch64_sync_memory_operand" "=Q")
+    (unspec_volatile:ALLI_TI
+      [(match_operand:ALLI_TI 2 "aarch64_reg_or_zero" "rZ")
        (match_operand:SI 3 "const_int_operand")]
       UNSPECV_SX))]
   ""
   {
-    enum memmodel model = memmodel_from_int (INTVAL (operands[3]));
-    if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_acquire (model))
-      return "stxr<atomic_sfx>\t%w0, %<w>2, %1";
+    bool release_needed = aarch_mm_needs_release (operands[3]);
+    if (GET_MODE (operands[1]) == TImode)
+      return release_needed
+	     ? "stlxp\t%w0, %x2, %H2, %1"
+	     : "stxp\t%w0, %x2, %H2, %1";
     else
-      return "stlxr<atomic_sfx>\t%w0, %<w>2, %1";
+      return release_needed
+	     ? "stlxr<atomic_sfx>\t%w0, %<w>2, %1"
+	     : "stxr<atomic_sfx>\t%w0, %<w>2, %1";
   }
 )
 
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index a43956054e82aaf651fb45d0ff254b248c02c644..8d3fe29f6e4b9a3a7a6c8fc32c1564ef88501fb4 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -26,6 +26,9 @@
 ;; Iterator for General Purpose Integer registers (32- and 64-bit modes)
 (define_mode_iterator GPI [SI DI])
 
+;; Iterator for SI, DI, TI.
+(define_mode_iterator GPI_TI [SI DI TI])
+
 ;; Iterator for HI, SI, DI, some instructions can only work on these modes.
 (define_mode_iterator GPI_I16 [(HI "AARCH64_ISA_F16") SI DI])
 
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 5b08b03c5868c7aa86f8844e3219a6e82717d4f0..a52b8989fe277d2ec3d32ee31d99708cb8ef2ced 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -389,6 +389,23 @@
     (match_operand 0 "aarch64_9bit_offset_memory_operand")
     (match_operand 0 "aarch64_sync_memory_operand")))
 
+;; Predicate to accept operands for TImode atomic load/store.
+;; Depends on the ISA because the AT extension makes LDP/STP atomic, and they
+;; accept more operands than LDAXP/STLXP.
+(define_predicate "aarch64_TImode_pair_operand"
+  (and (match_code "mem")
+	(ior (match_code "reg" "0")
+	     (and (match_code "plus" "0")
+		  (match_code "reg" "00")
+		  (match_code "const_int" "01")
+		  (match_test "aarch64_offset_7bit_signed_scaled_p (
+				DImode, INTVAL (XEXP (XEXP (op, 0), 1)))")))))
+
+(define_predicate "aarch64_atomic_TImode_operand"
+  (if_then_else (match_test "AARCH64_ISA_AT")
+    (match_operand 0 "aarch64_TImode_pair_operand")
+    (match_operand 0 "aarch64_sync_memory_operand")))
+
 ;; Predicates for parallel expanders based on mode.
 (define_special_predicate "vect_par_cnst_hi_half"
   (match_code "parallel")
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 685c211e176d13809078cf0fd595d49763adef25..d1e5cdfcdfb8e00400d422ed6869d60ebe4f03fb 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -15075,6 +15075,26 @@ and 2048.  @samp{scalable} is the default.
 At present, @samp{-msve-vector-bits=128} produces the same output
 as @samp{-msve-vector-bits=scalable}.
 
+@item -matomic-128bit-instructions
+@itemx -mno-atomic-128bit-instructions
+@opindex matomic-128bit-instructions
+@opindex mno-atomic-128bit-instructions
+Enable or disable using inline 128 bit atomic loads and stores.
+Without this flag atomic memory accesses of this size will be handled by
+libatomic.
+Inline accesses are faster than calls to libatomic but can interrupt accesses
+made through libatomic; this means that pre-existing code using libatomic is
+ABI incompatible with code generated using this flag.
+This option is disabled by default (@samp{-mno-atomic-128bit-instructions}).
+
+If this flag is used when targeting a processor that has the atomicity
+guarantees on the STP and LDP instructions added in Armv8.4 then GCC will use
+these instructions; otherwise GCC will generate a
+load-exclusive/store-exclusive read-modify-write loop.
+The use of a read-modify-write loop for an atomic load can cause a
+segmentation fault when atomically loading a variable that the compiler has
+put in read-only memory.
+
 @end table
 
 @subsubsection @option{-march} and @option{-mcpu} Feature Modifiers
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-load128.c b/gcc/testsuite/gcc.target/aarch64/atomic-load128.c
new file mode 100644
index 0000000000000000000000000000000000000000..b43599975db69201771adc6695d67da052be75a4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-load128.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-march=armv8.4-a -O2 -matomic-128bit-instructions" } */
+
+#include <stdlib.h>
+#include <stdatomic.h>
+#include <stdint.h>
+
+#define RUN_TESTS_NAMED(prefix) \
+  void \
+  prefix##128 () \
+{ \
+  __int128 *atomic_vals = calloc (4, sizeof (__int128)); \
+  __int128 temp_val; \
+  temp_val = atomic_load_explicit (atomic_vals, memory_order_relaxed); \
+  temp_val = atomic_load_explicit (atomic_vals, memory_order_acquire); \
+  temp_val = atomic_load_explicit ((atomic_vals + 1), memory_order_acquire); \
+  temp_val = atomic_load ((atomic_vals + 2)); \
+  temp_val = atomic_load_explicit ((atomic_vals + 3), memory_order_relaxed); \
+}
+
+RUN_TESTS_NAMED (bar);
+/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]+, \\\[x\[^\n\]*\n\[ \t\]*dmb\tishld" 3 } } */
+/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]+, \\\[x" 5 } } */
+
+__attribute__ ((target ("arch=armv8.3-a")))
+RUN_TESTS_NAMED (foo);
+/* { dg-final { scan-assembler-times "ldxp\tx\[0-9\]+, x\[0-9\]+, \\\[x" 2 } } */
+/* { dg-final { scan-assembler-times "ldaxp\tx\[0-9\]+, x\[0-9\]+, \\\[x" 3 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-store.c b/gcc/testsuite/gcc.target/aarch64/atomic-store.c
index 8cabc05b0d739dbfdcecf681348b62634fcfc9a4..141e4e317d73b12555163c8352218842d4250a37 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-store.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-store.c
@@ -1,23 +1,7 @@
 /* { dg-do compile } */
 /* { dg-options "-march=armv8.4-a -O2" } */
 
-#include <stdatomic.h>
-
-typedef __INT8_TYPE__ int8_t;
-typedef __INT16_TYPE__ int16_t;
-typedef __INT32_TYPE__ int32_t;
-typedef __INT64_TYPE__ int64_t;
-
-#define STORE_TESTS(size) \
-  void \
-  foo##size (int##size##_t *atomic_vals) \
-{ \
-  atomic_store_explicit (atomic_vals, 2, memory_order_relaxed); \
-  atomic_store_explicit (atomic_vals, 2, memory_order_release); \
-  atomic_store_explicit ((atomic_vals + 1), 2, memory_order_release); \
-  atomic_store ((atomic_vals + 2), 2); \
-  atomic_store_explicit ((atomic_vals + 3), 2, memory_order_relaxed); \
-}
+#include "atomic-store.x"
 
 STORE_TESTS (8);
 /* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
@@ -26,6 +10,7 @@ STORE_TESTS (8);
 /* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, 1\\\]" 1 } } */
 /* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, 2\\\]" 1 } } */
 /* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+, 3\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurb\twzr, \\\[x\[0-9\]+, 4\\\]" 1 } } */
 
 STORE_TESTS (16);
 /* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
@@ -33,6 +18,7 @@ STORE_TESTS (16);
 /* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, 2\\\]" 1 } } */
 /* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, 4\\\]" 1 } } */
 /* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+, 6\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurh\twzr, \\\[x\[0-9\]+, 8\\\]" 1 } } */
 
 STORE_TESTS (32);
 /* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
@@ -40,12 +26,14 @@ STORE_TESTS (32);
 /* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, 4\\\]" 1 } } */
 /* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, 8\\\]" 1 } } */
 /* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+, 12\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\twzr, \\\[x\[0-9\]+, 16\\\]" 1 } } */
 
 STORE_TESTS (64);
 /* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
 /* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, 8\\\]" 1 } } */
 /* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, 16\\\]" 1 } } */
 /* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+, 24\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\txzr, \\\[x\[0-9\]+, 32\\\]" 1 } } */
 
 void
 foo_toolarge_offset (int64_t *atomic_vals)
@@ -64,12 +52,20 @@ foo_negative (int8_t *atomic_vals)
 }
 /* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, -2\\\]" 1 { target { ! ilp32 } } } } */
 
-#pragma GCC target ("arch=armv8.3-a")
 void
+__attribute__ ((target ("arch=armv8.3-a")))
 foo_older_arch (int64_t *atomic_vals)
 {
   atomic_store_explicit (atomic_vals + 2, 2, memory_order_release);
 }
-
 /* Three times, one for each of the three above functions.  */
 /* { dg-final { scan-assembler-times "stlr\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 4 } } */
+
+/* This test is to show that the -matomic-128bit-instructions flag is needed
+ * to handle 128 bit atomic store.  */
+typedef __int128 int128_t;
+STORE_TESTS (128);
+/* { dg-final { scan-assembler-not "dmb\tish\n\[ \t\]*stp" } } */
+/* { dg-final { scan-assembler-not "stxp" } } */
+/* { dg-final { scan-assembler-not "stlxp" } } */
+/* { dg-final { scan-assembler-times "bl?\t__atomic_store_16" 6 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-store.x b/gcc/testsuite/gcc.target/aarch64/atomic-store.x
new file mode 100644
index 0000000000000000000000000000000000000000..5e6261a8d3ec3905b4a850cd33dbd1caa37a186e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-store.x
@@ -0,0 +1,20 @@
+#include <stdatomic.h>
+
+typedef __INT8_TYPE__ int8_t;
+typedef __INT16_TYPE__ int16_t;
+typedef __INT32_TYPE__ int32_t;
+typedef __INT64_TYPE__ int64_t;
+
+#define STORE_TESTS_NAMED(size, prefix) \
+void \
+prefix##size (int##size##_t *atomic_vals) \
+{ \
+  atomic_store_explicit (atomic_vals, 2, memory_order_relaxed); \
+  atomic_store_explicit (atomic_vals, 2, memory_order_release); \
+  atomic_store_explicit ((atomic_vals + 1), 2, memory_order_release); \
+  atomic_store ((atomic_vals + 2), 2); \
+  atomic_store_explicit ((atomic_vals + 3), 2, memory_order_relaxed); \
+  atomic_store_explicit (atomic_vals + 4, 0, memory_order_release); \
+}
+
+#define STORE_TESTS(size) STORE_TESTS_NAMED(size, foo)
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-store128.c b/gcc/testsuite/gcc.target/aarch64/atomic-store128.c
new file mode 100644
index 0000000000000000000000000000000000000000..bed864205128e2d8b6deb856d061ad13667cb14b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-store128.c
@@ -0,0 +1,74 @@
+/* { dg-do compile } */
+/* { dg-options "-march=armv8.4-a -O2 -matomic-128bit-instructions" } */
+
+#include "atomic-store.x"
+
+STORE_TESTS (8);
+/* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlrb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1  { target { ! ilp32 } } } } */
+/* { dg-final { scan-assembler-times "stlrb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 2  { target { ilp32 } } } } */
+/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, 1\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, 2\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+, 3\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurb\twzr, \\\[x\[0-9\]+, 4\\\]" 1 } } */
+
+STORE_TESTS (16);
+/* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlrh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, 2\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, 4\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+, 6\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurh\twzr, \\\[x\[0-9\]+, 8\\\]" 1 } } */
+
+STORE_TESTS (32);
+/* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlr\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, 4\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, 8\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+, 12\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\twzr, \\\[x\[0-9\]+, 16\\\]" 1 } } */
+
+STORE_TESTS (64);
+/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, 8\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, 16\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+, 24\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\txzr, \\\[x\[0-9\]+, 32\\\]" 1 } } */
+
+void
+foo_toolarge_offset (int64_t *atomic_vals)
+{
+  /* 9bit signed unscaled immediate =>
+	largest representable value +255.
+	smallest representable value -256.  */
+  atomic_store_explicit (atomic_vals + 32, 2, memory_order_release);
+  atomic_store_explicit (atomic_vals - 33, 2, memory_order_release);
+}
+
+void
+foo_negative (int8_t *atomic_vals)
+{
+  atomic_store_explicit (atomic_vals - 2, 2, memory_order_release);
+}
+/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, -2\\\]" 1 { target { ! ilp32 } } } } */
+
+void
+__attribute__ ((target ("arch=armv8.3-a")))
+foo_older_arch (int64_t *atomic_vals)
+{
+  atomic_store_explicit (atomic_vals + 2, 2, memory_order_release);
+}
+/* Three times, one for each of the three above functions.  */
+/* { dg-final { scan-assembler-times "stlr\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 4 } } */
+typedef __int128 int128_t;
+STORE_TESTS (128);
+/* { dg-final { scan-assembler-times "dmb\tish\n\[ \t\]*stp" 4 } } */
+/* { dg-final { scan-assembler-times "stp\tx\[0-9\]+, x\[0-9\]+, \\\[x" 5 } } */
+/* { dg-final { scan-assembler-times "stp\txzr, xzr, \\\[x" 1 } } */
+
+
+__attribute__ ((target ("arch=armv8.3-a")))
+STORE_TESTS_NAMED (128, bar);
+/* { dg-final { scan-assembler-times "stxp\tw\[0-9\]+, x\[0-9\]+, x\[0-9\]+, \\\[x" 2 } } */
+/* { dg-final { scan-assembler-times "stlxp\tw\[0-9\]+, x\[0-9\]+, x\[0-9\]+, \\\[x" 3 } } */
+/* { dg-final { scan-assembler-times "stlxp\tw\[0-9\]+, xzr, xzr, \\\[x" 1 } } */

Comments

Matthew Malcomson Oct. 15, 2018, 2:25 p.m. UTC | #1
ping


> +/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, 8\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, 16\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+, 24\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "stlur\txzr, \\\[x\[0-9\]+, 32\\\]" 1 } } */
> +
> +void
> +foo_toolarge_offset (int64_t *atomic_vals)
> +{
> +  /* 9bit signed unscaled immediate =>
> +	largest representable value +255.
> +	smallest representable value -256.  */
> +  atomic_store_explicit (atomic_vals + 32, 2, memory_order_release);
> +  atomic_store_explicit (atomic_vals - 33, 2, memory_order_release);
> +}
> +
> +void
> +foo_negative (int8_t *atomic_vals)
> +{
> +  atomic_store_explicit (atomic_vals - 2, 2, memory_order_release);
> +}
> +/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, -2\\\]" 1 { target { ! ilp32 } } } } */
> +
> +void
> +__attribute__ ((target ("arch=armv8.3-a")))
> +foo_older_arch (int64_t *atomic_vals)
> +{
> +  atomic_store_explicit (atomic_vals + 2, 2, memory_order_release);
> +}
> +/* Three times, one for each of the three above functions.  */
> +/* { dg-final { scan-assembler-times "stlr\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 4 } } */
> +typedef __int128 int128_t;
> +STORE_TESTS (128);
> +/* { dg-final { scan-assembler-times "dmb\tish\n\[ \t\]*stp" 4 } } */
> +/* { dg-final { scan-assembler-times "stp\tx\[0-9\]+, x\[0-9\]+, \\\[x" 5 } } */
> +/* { dg-final { scan-assembler-times "stp\txzr, xzr, \\\[x" 1 } } */
> +
> +
> +__attribute__ ((target ("arch=armv8.3-a")))
> +STORE_TESTS_NAMED (128, bar);
> +/* { dg-final { scan-assembler-times "stxp\tw\[0-9\]+, x\[0-9\]+, x\[0-9\]+, \\\[x" 2 } } */
> +/* { dg-final { scan-assembler-times "stlxp\tw\[0-9\]+, x\[0-9\]+, x\[0-9\]+, \\\[x" 3 } } */
> +/* { dg-final { scan-assembler-times "stlxp\tw\[0-9\]+, xzr, xzr, \\\[x" 1 } } */
>
Matthew Malcomson Dec. 17, 2018, 1:29 p.m. UTC | #2
Ping



Patch

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index caf1d2041f0cac8e3f975f8384a167a90dc638e5..578ea925fac9a7237af3a53e7ec642d0ba8e7b93 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -560,6 +560,8 @@  machine_mode aarch64_select_cc_mode (RTX_CODE, rtx, rtx);
 rtx aarch64_gen_compare_reg (RTX_CODE, rtx, rtx);
 rtx aarch64_load_tp (rtx);
 
+void aarch64_split_atomic_ti_access (rtx op[], bool);
+
 void aarch64_expand_compare_and_swap (rtx op[]);
 void aarch64_split_compare_and_swap (rtx op[]);
 void aarch64_gen_atomic_cas (rtx, rtx, rtx, rtx, rtx);
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index e5cdb1d54f4ee96140202ea21a9478438d208f45..c1e407b5a3f27aa7eea9c35e749fe597e79f3e65 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -158,9 +158,10 @@  extern unsigned aarch64_architecture_version;
 #define AARCH64_FL_SHA3	      (1 << 18)  /* Has ARMv8.4-a SHA3 and SHA512.  */
 #define AARCH64_FL_F16FML     (1 << 19)  /* Has ARMv8.4-a FP16 extensions.  */
 #define AARCH64_FL_RCPC8_4    (1 << 20)  /* Has ARMv8.4-a RCPC extensions.  */
+#define AARCH64_FL_AT         (1 << 21)  /* Has ARMv8.4-a AT extensions.  */
 
 /* Statistical Profiling extensions.  */
-#define AARCH64_FL_PROFILE    (1 << 21)
+#define AARCH64_FL_PROFILE    (1 << 22)
 
 /* Has FP and SIMD.  */
 #define AARCH64_FL_FPSIMD     (AARCH64_FL_FP | AARCH64_FL_SIMD)
@@ -179,7 +180,7 @@  extern unsigned aarch64_architecture_version;
   (AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_V8_3)
 #define AARCH64_FL_FOR_ARCH8_4			\
   (AARCH64_FL_FOR_ARCH8_3 | AARCH64_FL_V8_4 | AARCH64_FL_F16FML \
-   | AARCH64_FL_DOTPROD | AARCH64_FL_RCPC8_4)
+   | AARCH64_FL_DOTPROD | AARCH64_FL_RCPC8_4 | AARCH64_FL_AT)
 
 /* Macros to test ISA flags.  */
 
@@ -201,6 +202,7 @@  extern unsigned aarch64_architecture_version;
 #define AARCH64_ISA_SHA3	   (aarch64_isa_flags & AARCH64_FL_SHA3)
 #define AARCH64_ISA_F16FML	   (aarch64_isa_flags & AARCH64_FL_F16FML)
 #define AARCH64_ISA_RCPC8_4	   (aarch64_isa_flags & AARCH64_FL_RCPC8_4)
+#define AARCH64_ISA_AT	           (aarch64_isa_flags & AARCH64_FL_AT)
 
 /* Crypto is an optional extension to AdvSIMD.  */
 #define TARGET_CRYPTO (TARGET_SIMD && AARCH64_ISA_CRYPTO)
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index cbf9d0c09b23712a67a5f0781c247cc859ade18d..7b46ca38a8cf55c6359e2f577bb9e15363dd3132 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -14160,6 +14160,80 @@  aarch64_emit_post_barrier (enum memmodel model)
     }
 }
 
+/* Emit an emulation of an atomic TImode access using a
+   load-exclusive/store-exclusive retry loop.  */
+void
+aarch64_split_atomic_ti_access (rtx operands[], bool loading)
+{
+  rtx dest, src, model_rtx, scratch;
+  dest = operands[0];
+  src = operands[1];
+  model_rtx = operands[2];
+  scratch = operands[3];
+
+  machine_mode mode = GET_MODE (src);
+  gcc_assert (mode == TImode || (mode == VOIDmode && src == const0_rtx));
+
+  rtx_code_label *label = gen_label_rtx ();
+  emit_label (label);
+
+  rtx scratch_flag;
+  /* Below we rely on the fact that the memory ordering requirements of a
+     sequentially consistent load are the same as those of a load-acquire,
+     and similarly that a sequentially consistent store places the same
+     requirements as a store-release.
+
+     Sequential consistency does additionally require a total ordering over
+     all atomic modifications of memory that are themselves sequentially
+     consistent.  That memory ordering requirement is already provided by the
+     fact that the Armv8 memory model is other-multi-copy atomic (page B2-96 of
+     the ARM Architecture Reference Manual issue C.a) in combination with the
+     load-acquire/store-release semantics.
+
+     Given that the aim of this expansion is to behave as an
+     atomic_{load,store}ti, these observations show that we do not need to
+     provide any special handling for sequentially consistent memory ordering
+     over and above the handling for load-acquire and store-release
+     semantics.  */
+  if (loading)
+    {
+      /* For load-acquire semantics we require that no reads or writes can be
+	 reordered to before the observed load.  Hence all we need is for that
+	 load to have the required memory ordering semantics.  */
+      scratch_flag = scratch;
+      emit_insn (gen_aarch64_load_exclusive (TImode, dest, src, model_rtx));
+      emit_insn (gen_aarch64_store_exclusive (TImode, scratch_flag, src, dest,
+					      GEN_INT (MEMMODEL_RELAXED)));
+    }
+  else
+    {
+      /* For store-release semantics we require that no memory access is
+	 reordered to after the store-exclusive that is observed.  This is
+	 satisfied by having that store-exclusive instruction execute with
+	 store-release memory semantics.  */
+      emit_insn (gen_aarch64_load_exclusive (TImode, scratch, dest,
+					     GEN_INT (MEMMODEL_RELAXED)));
+      scratch_flag = gen_lowpart (SImode, scratch);
+      emit_insn (gen_aarch64_store_exclusive (TImode, scratch_flag,
+					      dest, src, model_rtx));
+    }
+
+  rtx x;
+  if (aarch64_track_speculation)
+    {
+      /* Emit an explicit compare instruction, so that we can correctly
+	 track the condition codes.  */
+      rtx cc_reg = aarch64_gen_compare_reg (NE, scratch_flag, const0_rtx);
+      x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
+    }
+  else
+    x = gen_rtx_NE (VOIDmode, scratch_flag, const0_rtx);
+
+  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+			    gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
+  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
+}
+
 /* Emit an atomic compare-and-swap operation.  RVAL is the destination register
    for the data in memory.  EXPECTED is the value expected to be in memory.
    DESIRED is the value to store to memory.  MEM is the memory location.  MODEL
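
(Illustration, not part of the patch: the retry loop that
aarch64_split_atomic_ti_access emits for the non-AT case corresponds roughly
to the hand-written C below.  The function name and the use of inline
assembly are invented for this sketch; the real sequence is generated as RTL
and the sketch assumes a little-endian AArch64 target.)

#include <stdint.h>

/* Hand-written sketch only.  Note that even the *load* performs a
   store-exclusive of the value just read; that is what makes the 16-byte
   read single-copy atomic, and also why a load from read-only memory can
   fault with this emulation.  */
static inline unsigned __int128
sketch_atomic_load_acquire_128 (unsigned __int128 *ptr)
{
  uint64_t lo, hi;
  uint32_t failed;
  do
    {
      __asm__ volatile ("ldaxp\t%0, %1, %3\n\t"  /* load-exclusive pair, acquire  */
			"stxp\t%w2, %0, %1, %3"  /* write the same value back     */
			: "=&r" (lo), "=&r" (hi), "=&r" (failed), "+Q" (*ptr)
			:
			: "memory");
    }
  while (failed);
  return ((unsigned __int128) hi << 64) | lo;
}
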
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index b2e80cbf6f1f9727c4309874b1122f975fb6b9be..3b1769ba67b6e94cc7c05e051902a8a0f2cfcbe0 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -218,3 +218,9 @@  Enables verbose cost model dumping in the debug dump files.
 mtrack-speculation
 Target Var(aarch64_track_speculation)
 Generate code to track when the CPU might be speculating incorrectly.
+
+matomic-128bit-instructions
+Target Var(aarch64_handle_128bit_atomics) Init(false)
+Use inline architectural atomic sequences for 128-bit atomic loads and stores
+instead of calling libatomic.  128-bit atomics in code compiled with this
+option are ABI incompatible with 128-bit atomics in code compiled without it.
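
(Illustration of the ABI hazard described by the option text above, with
invented names.  Think of writer and reader as living in two separate
translation units, only one of which is compiled with
-matomic-128bit-instructions; their accesses to `shared' are then no longer
atomic with respect to each other.)

#include <stdatomic.h>

__int128 shared;

void
writer (void)
{
  /* With the flag: expanded inline (STP or STXP sequences).  */
  atomic_store_explicit (&shared, (__int128) 1, memory_order_release);
}

__int128
reader (void)
{
  /* Without the flag: becomes a call to libatomic's __atomic_load_16,
     which knows nothing about the inline sequence used by writer.  */
  return atomic_load_explicit (&shared, memory_order_acquire);
}
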
diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index bba8e9e9c8e61d95fcfb61e650e7e76671c8f996..b47abcf7250aa2045ab8ced52a1373f6d4d71047 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -472,11 +472,66 @@ 
       UNSPECV_LDA))]
   ""
   {
-    enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
-    if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_release (model))
-      return "ldr<atomic_sfx>\t%<w>0, %1";
-    else
+    if (aarch_mm_needs_acquire (operands[2]))
       return "ldar<atomic_sfx>\t%<w>0, %1";
+    else
+      return "ldr<atomic_sfx>\t%<w>0, %1";
+  }
+)
+
+(define_expand "atomic_loadti"
+ [(match_operand:TI 0 "register_operand" "")
+  (match_operand:TI 1 "aarch64_atomic_TImode_operand" "")
+  (match_operand:TI 2 "const_int_operand" "")]
+ "aarch64_handle_128bit_atomics"
+ {
+    if (AARCH64_ISA_AT)
+      {
+	emit_insn (gen_aarch64_atomic_loadti_ldp (operands[0], operands[1],
+						  operands[2]));
+	DONE;
+      }
+
+    emit_insn (gen_aarch64_atomic_loadti_loop (operands[0], operands[1],
+						operands[2]));
+    DONE;
+ }
+)
+
+(define_insn "aarch64_atomic_loadti_ldp"
+  [(set (match_operand:TI 0 "register_operand" "=r")
+    (unspec_volatile:TI
+      [(match_operand:TI 1 "aarch64_TImode_pair_operand" "Umn")
+       (match_operand:SI 2 "const_int_operand")]			;; model
+      UNSPECV_LDA))]
+  "aarch64_handle_128bit_atomics && AARCH64_ISA_AT"
+  {
+    output_asm_insn ("ldp\\t%0, %H0, %1", operands);
+    return aarch_mm_needs_acquire (operands[2])
+	  ? "dmb\\tishld"
+	  : "";
+  }
+  [(set (attr "length")
+	(if_then_else (match_test "aarch_mm_needs_acquire (operands[2])")
+		      (const_int 8)
+		      (const_int 4)))]
+)
+
+(define_insn_and_split "aarch64_atomic_loadti_loop"
+  [(set (match_operand:TI 0 "register_operand" "=&r")
+    (unspec_volatile:TI
+      [(match_operand:TI 1 "aarch64_sync_memory_operand" "Q")
+       (match_operand:SI 2 "const_int_operand")]			;; model
+      UNSPECV_LDA))
+   (clobber (reg:CC CC_REGNUM))
+   (clobber (match_scratch:SI 3 "=&r"))]
+  "aarch64_handle_128bit_atomics && !AARCH64_ISA_AT"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+  {
+    aarch64_split_atomic_ti_access (operands, true);
+    DONE;
   }
 )
 
@@ -488,8 +543,7 @@ 
       UNSPECV_STL))]
   ""
   {
-    enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
-    if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_acquire (model))
+    if (! aarch_mm_needs_release (operands[2]))
       return "str<atomic_sfx>\t%<w>1, %0";
     else if (which_alternative == 0)
       return "stlr<atomic_sfx>\t%<w>1, %0";
@@ -499,6 +553,61 @@ 
   [(set_attr "arch" "*,rcpc8_4")]
 )
 
+(define_expand "atomic_storeti"
+ [(match_operand:TI 0 "aarch64_atomic_TImode_operand" "")
+  (match_operand:TI 1 "aarch64_reg_or_zero" "")
+  (match_operand:TI 2 "const_int_operand" "")]
+ "aarch64_handle_128bit_atomics"
+ {
+    if (AARCH64_ISA_AT)
+      {
+	emit_insn (gen_aarch64_atomic_storeti_stp (operands[0], operands[1],
+						   operands[2]));
+	DONE;
+      }
+
+    emit_insn (gen_aarch64_atomic_storeti_loop (operands[0], operands[1],
+						 operands[2]));
+    DONE;
+ }
+)
+
+(define_insn "aarch64_atomic_storeti_stp"
+  [(set (match_operand:TI 0 "aarch64_TImode_pair_operand" "=Umn")
+    (unspec_volatile:TI
+      [(match_operand:TI 1 "aarch64_reg_or_zero" "rZ")
+       (match_operand:SI 2 "const_int_operand")]			;; model
+      UNSPECV_STL)) ]
+  "aarch64_handle_128bit_atomics && AARCH64_ISA_AT"
+  {
+    if (aarch_mm_needs_release (operands[2]))
+      output_asm_insn ("dmb\tish", operands);
+    return "stp\t%x1, %H1, %0";
+  }
+  [(set (attr "length")
+	(if_then_else (match_test "aarch_mm_needs_release (operands[2])")
+		      (const_int 8)
+		      (const_int 4)))]
+)
+
+(define_insn_and_split "aarch64_atomic_storeti_loop"
+  [(set (match_operand:TI 0 "aarch64_sync_memory_operand" "=Q")
+    (unspec_volatile:TI
+      [(match_operand:TI 1 "aarch64_reg_or_zero" "rZ")
+       (match_operand:SI 2 "const_int_operand")]			;; model
+      UNSPECV_STL))
+   (clobber (reg:CC CC_REGNUM))
+   (clobber (match_scratch:TI 3 "=&r"))]
+  "aarch64_handle_128bit_atomics && !AARCH64_ISA_AT"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+  {
+    aarch64_split_atomic_ti_access (operands, false);
+    DONE;
+  }
+)
+
 (define_insn "@aarch64_load_exclusive<mode>"
   [(set (match_operand:SI 0 "register_operand" "=r")
     (zero_extend:SI
@@ -508,45 +617,52 @@ 
 	UNSPECV_LX)))]
   ""
   {
-    enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
-    if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_release (model))
-      return "ldxr<atomic_sfx>\t%w0, %1";
-    else
+    if (aarch_mm_needs_acquire (operands[2]))
       return "ldaxr<atomic_sfx>\t%w0, %1";
+    else
+      return "ldxr<atomic_sfx>\t%w0, %1";
   }
 )
 
 (define_insn "@aarch64_load_exclusive<mode>"
-  [(set (match_operand:GPI 0 "register_operand" "=r")
-    (unspec_volatile:GPI
-      [(match_operand:GPI 1 "aarch64_sync_memory_operand" "Q")
+  [(set (match_operand:GPI_TI 0 "register_operand" "=r")
+    (unspec_volatile:GPI_TI
+      [(match_operand:GPI_TI 1 "aarch64_sync_memory_operand" "Q")
        (match_operand:SI 2 "const_int_operand")]
       UNSPECV_LX))]
   ""
   {
-    enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
-    if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_release (model))
-      return "ldxr\t%<w>0, %1";
+    bool acquire_needed = aarch_mm_needs_acquire (operands[2]);
+    if (GET_MODE (operands[1]) == TImode)
+      return acquire_needed
+	     ? "ldaxp\t%0, %H0, %1"
+	     : "ldxp\t%0, %H0, %1";
     else
-      return "ldaxr\t%<w>0, %1";
+      return acquire_needed
+	     ? "ldaxr\t%<w>0, %1"
+	     : "ldxr\t%<w>0, %1";
   }
 )
 
 (define_insn "@aarch64_store_exclusive<mode>"
   [(set (match_operand:SI 0 "register_operand" "=&r")
     (unspec_volatile:SI [(const_int 0)] UNSPECV_SX))
-   (set (match_operand:ALLI 1 "aarch64_sync_memory_operand" "=Q")
-    (unspec_volatile:ALLI
-      [(match_operand:ALLI 2 "aarch64_reg_or_zero" "rZ")
+   (set (match_operand:ALLI_TI 1 "aarch64_sync_memory_operand" "=Q")
+    (unspec_volatile:ALLI_TI
+      [(match_operand:ALLI_TI 2 "aarch64_reg_or_zero" "rZ")
        (match_operand:SI 3 "const_int_operand")]
       UNSPECV_SX))]
   ""
   {
-    enum memmodel model = memmodel_from_int (INTVAL (operands[3]));
-    if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_acquire (model))
-      return "stxr<atomic_sfx>\t%w0, %<w>2, %1";
+    bool release_needed = aarch_mm_needs_release (operands[3]);
+    if (GET_MODE (operands[1]) == TImode)
+      return release_needed
+	     ? "stlxp\t%w0, %x2, %H2, %1"
+	     : "stxp\t%w0, %x2, %H2, %1";
     else
-      return "stlxr<atomic_sfx>\t%w0, %<w>2, %1";
+      return release_needed
+	     ? "stlxr<atomic_sfx>\t%w0, %<w>2, %1"
+	     : "stxr<atomic_sfx>\t%w0, %<w>2, %1";
   }
 )
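
(Illustration of the two storeti paths above, with invented function names;
assumes -march=armv8.4-a and -matomic-128bit-instructions so the
atomic_storeti expander picks the STP pattern.  A relaxed store should expand
to a bare STP, while a release store gets a DMB ISH in front of it.)

#include <stdatomic.h>

void
store_relaxed (__int128 *p, __int128 v)
{
  atomic_store_explicit (p, v, memory_order_relaxed);	/* stp x, x, [x]  */
}

void
store_release (__int128 *p, __int128 v)
{
  atomic_store_explicit (p, v, memory_order_release);	/* dmb ish; stp   */
}
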
 
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index a43956054e82aaf651fb45d0ff254b248c02c644..8d3fe29f6e4b9a3a7a6c8fc32c1564ef88501fb4 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -26,6 +26,9 @@ 
 ;; Iterator for General Purpose Integer registers (32- and 64-bit modes)
 (define_mode_iterator GPI [SI DI])
 
+;; Iterator for SI, DI, TI.
+(define_mode_iterator GPI_TI [SI DI TI])
+
 ;; Iterator for HI, SI, DI, some instructions can only work on these modes.
 (define_mode_iterator GPI_I16 [(HI "AARCH64_ISA_F16") SI DI])
 
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 5b08b03c5868c7aa86f8844e3219a6e82717d4f0..a52b8989fe277d2ec3d32ee31d99708cb8ef2ced 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -389,6 +389,23 @@ 
     (match_operand 0 "aarch64_9bit_offset_memory_operand")
     (match_operand 0 "aarch64_sync_memory_operand")))
 
+;; Predicate to accept operands for TImode atomic load/store.
+;; Depends on the ISA because the AT extension makes LDP/STP atomic, and they
+;; accept more operands than LDAXP/STLXP.
+(define_predicate "aarch64_TImode_pair_operand"
+  (and (match_code "mem")
+	(ior (match_code "reg" "0")
+	     (and (match_code "plus" "0")
+		  (match_code "reg" "00")
+		  (match_code "const_int" "01")
+		  (match_test "aarch64_offset_7bit_signed_scaled_p (
+				DImode, INTVAL (XEXP (XEXP (op, 0), 1)))")))))
+
+(define_predicate "aarch64_atomic_TImode_operand"
+  (if_then_else (match_test "AARCH64_ISA_AT")
+    (match_operand 0 "aarch64_TImode_pair_operand")
+    (match_operand 0 "aarch64_sync_memory_operand")))
+
 ;; Predicates for parallel expanders based on mode.
 (define_special_predicate "vect_par_cnst_hi_half"
   (match_code "parallel")
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 685c211e176d13809078cf0fd595d49763adef25..d1e5cdfcdfb8e00400d422ed6869d60ebe4f03fb 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -15075,6 +15075,26 @@  and 2048.  @samp{scalable} is the default.
 At present, @samp{-msve-vector-bits=128} produces the same output
 as @samp{-msve-vector-bits=scalable}.
 
+@item -matomic-128bit-instructions
+@itemx -mno-atomic-128bit-instructions
+@opindex matomic-128bit-instructions
+@opindex mno-atomic-128bit-instructions
+Enable or disable the use of inline 128-bit atomic loads and stores.
+Without this flag, atomic memory accesses of this size are handled by
+libatomic.
+Inline accesses are faster than calls to libatomic but do not synchronize
+with accesses made through libatomic; this means that pre-existing code
+using libatomic is ABI incompatible with code generated using this flag.
+This option is disabled by default (@samp{-mno-atomic-128bit-instructions}).
+
+If this flag is used when targeting a processor that has the Armv8.4
+atomicity guarantees on the LDP and STP instructions, GCC uses those
+instructions; otherwise GCC generates a load-exclusive/store-exclusive
+read-modify-write loop.
+The use of a read-modify-write loop for an atomic load can cause a
+segmentation fault when atomically loading a variable that the compiler has
+placed in read-only memory.
+
 @end table
 
 @subsubsection @option{-march} and @option{-mcpu} Feature Modifiers
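
(Illustration of the read-only-memory caveat documented above, with invented
names: because the emulation loop also performs a store-exclusive, an atomic
load from an object the compiler placed in read-only memory can fault at run
time on targets without the Armv8.4 LDP/STP guarantee.)

#include <stdatomic.h>

/* `table_entry' is const, so the compiler may place it in .rodata.  */
static const __int128 table_entry = 42;

__int128
read_entry (void)
{
  /* On a pre-Armv8.4 target with -matomic-128bit-instructions this load
     expands to an LDXP/STXP loop whose store-exclusive writes the
     location it has just read.  */
  return atomic_load_explicit ((__int128 *) &table_entry,
			       memory_order_acquire);
}
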
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-load128.c b/gcc/testsuite/gcc.target/aarch64/atomic-load128.c
new file mode 100644
index 0000000000000000000000000000000000000000..b43599975db69201771adc6695d67da052be75a4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-load128.c
@@ -0,0 +1,28 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=armv8.4-a -O2 -matomic-128bit-instructions" } */
+
+#include <stdlib.h>
+#include <stdatomic.h>
+#include <stdint.h>
+
+#define RUN_TESTS_NAMED(prefix) \
+  void \
+  prefix##128 () \
+{ \
+  __int128 *atomic_vals = calloc (4, sizeof (__int128)); \
+  __int128 temp_val; \
+  temp_val = atomic_load_explicit (atomic_vals, memory_order_relaxed); \
+  temp_val = atomic_load_explicit (atomic_vals, memory_order_acquire); \
+  temp_val = atomic_load_explicit ((atomic_vals + 1), memory_order_acquire); \
+  temp_val = atomic_load ((atomic_vals + 2)); \
+  temp_val = atomic_load_explicit ((atomic_vals + 3), memory_order_relaxed); \
+}
+
+RUN_TESTS_NAMED (bar);
+/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]+, \\\[x\[^\n\]*\n\[ \t\]*dmb\tishld" 3 } } */
+/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]+, \\\[x" 5 } } */
+
+__attribute__ ((target ("arch=armv8.3-a")))
+RUN_TESTS_NAMED (foo);
+/* { dg-final { scan-assembler-times "ldxp\tx\[0-9\]+, x\[0-9\]+, \\\[x" 2 } } */
+/* { dg-final { scan-assembler-times "ldaxp\tx\[0-9\]+, x\[0-9\]+, \\\[x" 3 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-store.c b/gcc/testsuite/gcc.target/aarch64/atomic-store.c
index 8cabc05b0d739dbfdcecf681348b62634fcfc9a4..141e4e317d73b12555163c8352218842d4250a37 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-store.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-store.c
@@ -1,23 +1,7 @@ 
 /* { dg-do compile } */
 /* { dg-options "-march=armv8.4-a -O2" } */
 
-#include <stdatomic.h>
-
-typedef __INT8_TYPE__ int8_t;
-typedef __INT16_TYPE__ int16_t;
-typedef __INT32_TYPE__ int32_t;
-typedef __INT64_TYPE__ int64_t;
-
-#define STORE_TESTS(size) \
-  void \
-  foo##size (int##size##_t *atomic_vals) \
-{ \
-  atomic_store_explicit (atomic_vals, 2, memory_order_relaxed); \
-  atomic_store_explicit (atomic_vals, 2, memory_order_release); \
-  atomic_store_explicit ((atomic_vals + 1), 2, memory_order_release); \
-  atomic_store ((atomic_vals + 2), 2); \
-  atomic_store_explicit ((atomic_vals + 3), 2, memory_order_relaxed); \
-}
+#include "atomic-store.x"
 
 STORE_TESTS (8);
 /* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
@@ -26,6 +10,7 @@  STORE_TESTS (8);
 /* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, 1\\\]" 1 } } */
 /* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, 2\\\]" 1 } } */
 /* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+, 3\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurb\twzr, \\\[x\[0-9\]+, 4\\\]" 1 } } */
 
 STORE_TESTS (16);
 /* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
@@ -33,6 +18,7 @@  STORE_TESTS (16);
 /* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, 2\\\]" 1 } } */
 /* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, 4\\\]" 1 } } */
 /* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+, 6\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurh\twzr, \\\[x\[0-9\]+, 8\\\]" 1 } } */
 
 STORE_TESTS (32);
 /* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
@@ -40,12 +26,14 @@  STORE_TESTS (32);
 /* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, 4\\\]" 1 } } */
 /* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, 8\\\]" 1 } } */
 /* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+, 12\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\twzr, \\\[x\[0-9\]+, 16\\\]" 1 } } */
 
 STORE_TESTS (64);
 /* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
 /* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, 8\\\]" 1 } } */
 /* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, 16\\\]" 1 } } */
 /* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+, 24\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\txzr, \\\[x\[0-9\]+, 32\\\]" 1 } } */
 
 void
 foo_toolarge_offset (int64_t *atomic_vals)
@@ -64,12 +52,20 @@  foo_negative (int8_t *atomic_vals)
 }
 /* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, -2\\\]" 1 { target { ! ilp32 } } } } */
 
-#pragma GCC target ("arch=armv8.3-a")
 void
+__attribute__ ((target ("arch=armv8.3-a")))
 foo_older_arch (int64_t *atomic_vals)
 {
   atomic_store_explicit (atomic_vals + 2, 2, memory_order_release);
 }
-
 /* Three times, one for each of the three above functions.  */
 /* { dg-final { scan-assembler-times "stlr\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 4 } } */
+
+/* This test shows that the -matomic-128bit-instructions flag is needed to
+   handle 128-bit atomic stores.  */
+typedef __int128 int128_t;
+STORE_TESTS (128);
+/* { dg-final { scan-assembler-not "dmb\tish\n\[ \t\]*stp" } } */
+/* { dg-final { scan-assembler-not "stxp" } } */
+/* { dg-final { scan-assembler-not "stlxp" } } */
+/* { dg-final { scan-assembler-times "bl?\t__atomic_store_16" 6 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-store.x b/gcc/testsuite/gcc.target/aarch64/atomic-store.x
new file mode 100644
index 0000000000000000000000000000000000000000..5e6261a8d3ec3905b4a850cd33dbd1caa37a186e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-store.x
@@ -0,0 +1,20 @@ 
+#include <stdatomic.h>
+
+typedef __INT8_TYPE__ int8_t;
+typedef __INT16_TYPE__ int16_t;
+typedef __INT32_TYPE__ int32_t;
+typedef __INT64_TYPE__ int64_t;
+
+#define STORE_TESTS_NAMED(size, prefix) \
+void \
+prefix##size (int##size##_t *atomic_vals) \
+{ \
+  atomic_store_explicit (atomic_vals, 2, memory_order_relaxed); \
+  atomic_store_explicit (atomic_vals, 2, memory_order_release); \
+  atomic_store_explicit ((atomic_vals + 1), 2, memory_order_release); \
+  atomic_store ((atomic_vals + 2), 2); \
+  atomic_store_explicit ((atomic_vals + 3), 2, memory_order_relaxed); \
+  atomic_store_explicit (atomic_vals + 4, 0, memory_order_release); \
+}
+
+#define STORE_TESTS(size) STORE_TESTS_NAMED(size, foo)
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-store128.c b/gcc/testsuite/gcc.target/aarch64/atomic-store128.c
new file mode 100644
index 0000000000000000000000000000000000000000..bed864205128e2d8b6deb856d061ad13667cb14b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-store128.c
@@ -0,0 +1,74 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=armv8.4-a -O2 -matomic-128bit-instructions" } */
+
+#include "atomic-store.x"
+
+STORE_TESTS (8);
+/* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlrb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1  { target { ! ilp32 } } } } */
+/* { dg-final { scan-assembler-times "stlrb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 2  { target { ilp32 } } } } */
+/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, 1\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, 2\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+, 3\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurb\twzr, \\\[x\[0-9\]+, 4\\\]" 1 } } */
+
+STORE_TESTS (16);
+/* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlrh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, 2\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, 4\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+, 6\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurh\twzr, \\\[x\[0-9\]+, 8\\\]" 1 } } */
+
+STORE_TESTS (32);
+/* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlr\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, 4\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, 8\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+, 12\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\twzr, \\\[x\[0-9\]+, 16\\\]" 1 } } */
+
+STORE_TESTS (64);
+/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, 8\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, 16\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+, 24\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\txzr, \\\[x\[0-9\]+, 32\\\]" 1 } } */
+
+void
+foo_toolarge_offset (int64_t *atomic_vals)
+{
+  /* 9bit signed unscaled immediate =>
+	largest representable value +255.
+	smallest representable value -256.  */
+  atomic_store_explicit (atomic_vals + 32, 2, memory_order_release);
+  atomic_store_explicit (atomic_vals - 33, 2, memory_order_release);
+}
+
+void
+foo_negative (int8_t *atomic_vals)
+{
+  atomic_store_explicit (atomic_vals - 2, 2, memory_order_release);
+}
+/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, -2\\\]" 1 { target { ! ilp32 } } } } */
+
+void
+__attribute__ ((target ("arch=armv8.3-a")))
+foo_older_arch (int64_t *atomic_vals)
+{
+  atomic_store_explicit (atomic_vals + 2, 2, memory_order_release);
+}
+/* Three times, one for each of the three above functions.  */
+/* { dg-final { scan-assembler-times "stlr\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 4 } } */
+typedef __int128 int128_t;
+STORE_TESTS (128);
+/* { dg-final { scan-assembler-times "dmb\tish\n\[ \t\]*stp" 4 } } */
+/* { dg-final { scan-assembler-times "stp\tx\[0-9\]+, x\[0-9\]+, \\\[x" 5 } } */
+/* { dg-final { scan-assembler-times "stp\txzr, xzr, \\\[x" 1 } } */
+
+
+__attribute__ ((target ("arch=armv8.3-a")))
+STORE_TESTS_NAMED (128, bar);
+/* { dg-final { scan-assembler-times "stxp\tw\[0-9\]+, x\[0-9\]+, x\[0-9\]+, \\\[x" 2 } } */
+/* { dg-final { scan-assembler-times "stlxp\tw\[0-9\]+, x\[0-9\]+, x\[0-9\]+, \\\[x" 3 } } */
+/* { dg-final { scan-assembler-times "stlxp\tw\[0-9\]+, xzr, xzr, \\\[x" 1 } } */