[AARCH64] enable STLUR use: Use STLUR in atomic_store

Message ID DB6PR0801MB2006AC17A55798788991F8FEE01D0@DB6PR0801MB2006.eurprd08.prod.outlook.com
State New
Series [AARCH64] enable STLUR use: Use STLUR in atomic_store

Commit Message

Matthew Malcomson Sept. 18, 2018, 9:15 a.m. UTC
[PATCH][GCC][AARCH64] Use STLUR for atomic_store

Use the STLUR instruction introduced in Armv8.4-A.
This instruction has the same store-release semantics as STLR but can take a
9-bit unscaled signed immediate offset.
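
For reference, the forms the new memory operand accepts can be restated as
a rough C sketch (illustrative only, not the backend code itself; the
-256..+255 range is what a signed 9-bit unscaled immediate can encode):
```
#include <stdbool.h>

/* Sketch of what the new aarch64_9bit_offset_memory_operand predicate
   accepts: an address that is either a bare 64-bit base register, or a
   base register plus a constant byte offset in the signed 9-bit unscaled
   range.  The offset is in bytes regardless of the access size.  */
static bool
stlur_address_ok (bool base_is_64bit_reg, bool has_const_offset,
                  long long offset)
{
  if (!base_is_64bit_reg)
    return false;
  if (!has_const_offset)
    return true;  /* Plain [base] form, which STLR already handles.  */
  return offset >= -256 && offset <= 255;
}
```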

Example test case:
```
#include <stdatomic.h>
#include <stdint.h>
#include <stdlib.h>

void
foo ()
{
    int32_t *atomic_vals = calloc (4, sizeof (int32_t));
    atomic_store_explicit (atomic_vals + 1, 2, memory_order_release);
}
```

Before the patch this generates
```
foo:
	stp	x29, x30, [sp, -16]!
	mov	x1, 4
	mov	x0, x1
	mov	x29, sp
	bl	calloc
	mov	w1, 2
	add	x0, x0, 4
	stlr	w1, [x0]
	ldp	x29, x30, [sp], 16
	ret
```

After the patch this generates
```
foo:
	stp	x29, x30, [sp, -16]!
	mov	x1, 4
	mov	x0, x1
	mov	x29, sp
	bl	calloc
	mov	w1, 2
	stlur	w1, [x0, 4]
	ldp	x29, x30, [sp], 16
	ret
```

We introduce a new feature flag to indicate the presence of this instruction.
The feature flag is called AARCH64_ISA_RCPC8_4 and is enabled when targeting
the Armv8.4-A architecture.

We also introduce a new value for the "arch" attribute, "rcpc8_4", named
after this feature flag, so that instruction alternatives can be gated on
its availability.
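
As a sketch of how these pieces fit together (using only names the patch
introduces; illustrative, not the backend sources):
```
/* aarch64_isa_flags is populated from -march/-mcpu.  The patch adds the
   RCPC8_4 bit and folds it into AARCH64_FL_FOR_ARCH8_4, so plain
   -march=armv8.4-a turns it on.  */
extern unsigned long aarch64_isa_flags;

#define AARCH64_FL_RCPC8_4   (1 << 20)
#define AARCH64_ISA_RCPC8_4  (aarch64_isa_flags & AARCH64_FL_RCPC8_4)

/* An insn alternative tagged with [(set_attr "arch" "rcpc8_4")] is only
   enabled when this test holds; older targets keep using the plain stlr
   alternative.  */
static int
rcpc8_4_alternative_enabled_p (void)
{
  return AARCH64_ISA_RCPC8_4 != 0;
}
```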

Full bootstrap and regression test done on aarch64-none-linux-gnu.
Ok for trunk?

gcc/

2018-09-18  Matthew Malcomson  <matthew.malcomson@arm.com>

	* config/aarch64/aarch64-protos.h
	(aarch64_offset_9bit_signed_unscaled_p): New declaration.
	* config/aarch64/aarch64.md (arches): New "rcpc8_4" attribute value.
	(arch_enabled): Add check for "rcpc8_4" attribute value of "arch".
	* config/aarch64/aarch64.h (AARCH64_FL_RCPC8_4): New bitfield.
	(AARCH64_FL_FOR_ARCH8_4): Include AARCH64_FL_RCPC8_4.
	(AARCH64_FL_PROFILE): Move index so flags are ordered.
	(AARCH64_ISA_RCPC8_4): New flag.
	* config/aarch64/aarch64.c (offset_9bit_signed_unscaled_p): Renamed
	to aarch64_offset_9bit_signed_unscaled_p.
	* config/aarch64/atomics.md (atomic_store<mode>): Allow offset
	and use stlur.
	* config/aarch64/constraints.md (Ust): New constraint.
	* config/aarch64/predicates.md (aarch64_9bit_offset_memory_operand):
	New predicate.

gcc/testsuite/

2018-09-18  Matthew Malcomson  <matthew.malcomson@arm.com>

	* gcc.target/aarch64/atomic-store.c: New.


###############     Attachment also inlined for ease of reply    ###############
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index ef95fc829b83886e2ff00e4664e31af916e99b0c..7a6254e46893fb36dc2ae57e7cfe78af67fb0e49 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -393,6 +393,7 @@ void aarch64_split_add_offset (scalar_int_mode, rtx, rtx, rtx, rtx, rtx);
 bool aarch64_mov_operand_p (rtx, machine_mode);
 rtx aarch64_reverse_mask (machine_mode, unsigned int);
 bool aarch64_offset_7bit_signed_scaled_p (machine_mode, poly_int64);
+bool aarch64_offset_9bit_signed_unscaled_p (machine_mode, poly_int64);
 char *aarch64_output_sve_cnt_immediate (const char *, const char *, rtx);
 char *aarch64_output_sve_addvl_addpl (rtx, rtx, rtx);
 char *aarch64_output_sve_inc_dec_immediate (const char *, rtx);
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index c1218503bab19323eee1cca8b7e4bea8fbfcf573..cc21e1656b75b4ed1e94d0eb4b2b3af0039ba47e 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -157,9 +157,10 @@ extern unsigned aarch64_architecture_version;
 #define AARCH64_FL_SM4	      (1 << 17)  /* Has ARMv8.4-A SM3 and SM4.  */
 #define AARCH64_FL_SHA3	      (1 << 18)  /* Has ARMv8.4-a SHA3 and SHA512.  */
 #define AARCH64_FL_F16FML     (1 << 19)  /* Has ARMv8.4-a FP16 extensions.  */
+#define AARCH64_FL_RCPC8_4    (1 << 20)  /* Has ARMv8.4-a RCPC extensions.  */
 
 /* Statistical Profiling extensions.  */
-#define AARCH64_FL_PROFILE    (1 << 20)
+#define AARCH64_FL_PROFILE    (1 << 21)
 
 /* Has FP and SIMD.  */
 #define AARCH64_FL_FPSIMD     (AARCH64_FL_FP | AARCH64_FL_SIMD)
@@ -178,7 +179,7 @@ extern unsigned aarch64_architecture_version;
   (AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_V8_3)
 #define AARCH64_FL_FOR_ARCH8_4			\
   (AARCH64_FL_FOR_ARCH8_3 | AARCH64_FL_V8_4 | AARCH64_FL_F16FML \
-   | AARCH64_FL_DOTPROD)
+   | AARCH64_FL_DOTPROD | AARCH64_FL_RCPC8_4)
 
 /* Macros to test ISA flags.  */
 
@@ -199,6 +200,7 @@ extern unsigned aarch64_architecture_version;
 #define AARCH64_ISA_SM4	           (aarch64_isa_flags & AARCH64_FL_SM4)
 #define AARCH64_ISA_SHA3	   (aarch64_isa_flags & AARCH64_FL_SHA3)
 #define AARCH64_ISA_F16FML	   (aarch64_isa_flags & AARCH64_FL_F16FML)
+#define AARCH64_ISA_RCPC8_4	   (aarch64_isa_flags & AARCH64_FL_RCPC8_4)
 
 /* Crypto is an optional extension to AdvSIMD.  */
 #define TARGET_CRYPTO (TARGET_SIMD && AARCH64_ISA_CRYPTO)
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 0d7ca9998466d8d4f9e79faf451a281f8d154d7d..b1a963689a35d406bf383ea7f90c8c2087be7c0a 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -4490,9 +4490,9 @@ aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
 
 /* Return true if OFFSET is a signed 9-bit value.  */
 
-static inline bool
-offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
-			       poly_int64 offset)
+bool
+aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
+				       poly_int64 offset)
 {
   HOST_WIDE_INT const_offset;
   return (offset.is_constant (&const_offset)
@@ -5767,7 +5767,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
 	     instruction memory accesses.  */
 	  if (mode == TImode || mode == TFmode)
 	    return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
-		    && (offset_9bit_signed_unscaled_p (mode, offset)
+		    && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
 			|| offset_12bit_unsigned_scaled_p (mode, offset)));
 
 	  /* A 7bit offset check because OImode will emit a ldp/stp
@@ -5781,7 +5781,8 @@ aarch64_classify_address (struct aarch64_address_info *info,
 	     ldr/str instructions (only big endian will get here).  */
 	  if (mode == CImode)
 	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
-		    && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
+		    && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
+							       offset + 32)
 			|| offset_12bit_unsigned_scaled_p (V16QImode,
 							   offset + 32)));
 
@@ -5821,7 +5822,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
 		     || known_eq (GET_MODE_SIZE (mode), 16))
 		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
 	  else
-	    return (offset_9bit_signed_unscaled_p (mode, offset)
+	    return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
 		    || offset_12bit_unsigned_scaled_p (mode, offset));
 	}
 
@@ -5874,7 +5875,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
 	   */
 	  if (mode == TImode || mode == TFmode)
 	    return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
-		    && offset_9bit_signed_unscaled_p (mode, offset));
+		    && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
 
 	  if (load_store_pair_p)
 	    return ((known_eq (GET_MODE_SIZE (mode), 4)
@@ -5882,7 +5883,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
 		     || known_eq (GET_MODE_SIZE (mode), 16))
 		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
 	  else
-	    return offset_9bit_signed_unscaled_p (mode, offset);
+	    return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
 	}
       return false;
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 955769a64d2030839cdb337321a808626188190e..7158bf0f2efdfb00763af13ac29c54a6723f19fa 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -263,7 +263,7 @@
 ;; alternative). This attribute is used to compute attribute "enabled", use type
 ;; "any" to enable an alternative in all cases.
 
-(define_enum "arches" [ any fp simd sve fp16])
+(define_enum "arches" [ any rcpc8_4 fp simd sve fp16])
 
 (define_enum_attr "arch" "arches" (const_string "any"))
 
@@ -285,6 +285,9 @@
     (ior
 	(eq_attr "arch" "any")
 
+	(and (eq_attr "arch" "rcpc8_4")
+	     (match_test "AARCH64_ISA_RCPC8_4"))
+
 	(and (eq_attr "arch" "fp")
 	     (match_test "TARGET_FLOAT"))
 
diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index 36c06756a1f94cadae097b3aad654fbeba1cf2f3..73078e412d01a43c05195f01488b95a2bc7a20ec 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -481,9 +481,9 @@
 )
 
 (define_insn "atomic_store<mode>"
-  [(set (match_operand:ALLI 0 "aarch64_sync_memory_operand" "=Q")
+  [(set (match_operand:ALLI 0 "aarch64_9bit_offset_memory_operand" "=Q,Ust")
     (unspec_volatile:ALLI
-      [(match_operand:ALLI 1 "general_operand" "rZ")
+      [(match_operand:ALLI 1 "general_operand" "rZ,rZ")
        (match_operand:SI 2 "const_int_operand")]			;; model
       UNSPECV_STL))]
   ""
@@ -491,9 +491,12 @@
     enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
     if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_acquire (model))
       return "str<atomic_sfx>\t%<w>1, %0";
-    else
+    else if (which_alternative == 0)
       return "stlr<atomic_sfx>\t%<w>1, %0";
+    else
+      return "stlur<atomic_sfx>\t%<w>1, %0";
   }
+  [(set_attr "arch" "*,rcpc8_4")]
 )
 
 (define_insn "@aarch64_load_exclusive<mode>"
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 72cacdabdac52dcb40b480f7a5bfbf4997c742d8..809b35e5fd377a8c6245138e0639c3afc41c7c13 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -218,6 +218,11 @@
  (and (match_code "mem")
       (match_test "REG_P (XEXP (op, 0))")))
 
+(define_memory_constraint "Ust"
+  "@internal
+  A memory address with 9bit unscaled offset."
+  (match_operand 0 "aarch64_9bit_offset_memory_operand"))
+
 (define_memory_constraint "Ump"
   "@internal
   A memory address suitable for a load/store pair operation."
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index d8f377b9603e76a29dd92f95e9905121eaf7b800..8016344f0e79bf881bfbe37547f115d094a66d0a 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -359,6 +359,31 @@
   (and (match_operand 0 "memory_operand")
        (match_code "reg" "0")))
 
+(define_predicate "aarch64_9bit_offset_memory_operand"
+  (and (match_operand 0 "memory_operand")
+       (ior (match_code "reg" "0")
+	    (and (match_code "plus" "0")
+		 (match_code "reg"  "00")
+		 (match_code "const_int" "01"))))
+{
+  rtx mem_op = XEXP (op, 0);
+
+  if (REG_P (mem_op))
+    return GET_MODE (mem_op) == DImode;
+
+  rtx plus_op0 = XEXP (mem_op, 0);
+  rtx plus_op1 = XEXP (mem_op, 1);
+
+  if (GET_MODE (plus_op0) != DImode)
+    return false;
+
+  poly_int64 offset;
+  if (!poly_int_rtx_p (plus_op1, &offset))
+    gcc_unreachable ();
+
+  return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
+})
+
 ;; Predicates for parallel expanders based on mode.
 (define_special_predicate "vect_par_cnst_hi_half"
   (match_code "parallel")
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-store.c b/gcc/testsuite/gcc.target/aarch64/atomic-store.c
new file mode 100644
index 0000000000000000000000000000000000000000..8cabc05b0d739dbfdcecf681348b62634fcfc9a4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-store.c
@@ -0,0 +1,75 @@
+/* { dg-do compile } */
+/* { dg-options "-march=armv8.4-a -O2" } */
+
+#include <stdatomic.h>
+
+typedef __INT8_TYPE__ int8_t;
+typedef __INT16_TYPE__ int16_t;
+typedef __INT32_TYPE__ int32_t;
+typedef __INT64_TYPE__ int64_t;
+
+#define STORE_TESTS(size) \
+  void \
+  foo##size (int##size##_t *atomic_vals) \
+{ \
+  atomic_store_explicit (atomic_vals, 2, memory_order_relaxed); \
+  atomic_store_explicit (atomic_vals, 2, memory_order_release); \
+  atomic_store_explicit ((atomic_vals + 1), 2, memory_order_release); \
+  atomic_store ((atomic_vals + 2), 2); \
+  atomic_store_explicit ((atomic_vals + 3), 2, memory_order_relaxed); \
+}
+
+STORE_TESTS (8);
+/* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlrb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1  { target { ! ilp32 } } } } */
+/* { dg-final { scan-assembler-times "stlrb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 2  { target { ilp32 } } } } */
+/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, 1\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, 2\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+, 3\\\]" 1 } } */
+
+STORE_TESTS (16);
+/* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlrh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, 2\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, 4\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+, 6\\\]" 1 } } */
+
+STORE_TESTS (32);
+/* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlr\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, 4\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, 8\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+, 12\\\]" 1 } } */
+
+STORE_TESTS (64);
+/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, 8\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, 16\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+, 24\\\]" 1 } } */
+
+void
+foo_toolarge_offset (int64_t *atomic_vals)
+{
+  /* 9bit signed unscaled immediate =>
+	largest representable value +255.
+	smallest representable value -256.  */
+  atomic_store_explicit (atomic_vals + 32, 2, memory_order_release);
+  atomic_store_explicit (atomic_vals - 33, 2, memory_order_release);
+}
+
+void
+foo_negative (int8_t *atomic_vals)
+{
+  atomic_store_explicit (atomic_vals - 2, 2, memory_order_release);
+}
+/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, -2\\\]" 1 { target { ! ilp32 } } } } */
+
+#pragma GCC target ("arch=armv8.3-a")
+void
+foo_older_arch (int64_t *atomic_vals)
+{
+  atomic_store_explicit (atomic_vals + 2, 2, memory_order_release);
+}
+
+/* Four times: two in foo_toolarge_offset, one in foo_older_arch, and one for the zero-offset release store in foo64.  */
+/* { dg-final { scan-assembler-times "stlr\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 4 } } */

Comments

Richard Earnshaw (lists) Sept. 18, 2018, 2:10 p.m. UTC | #1
On 18/09/18 10:15, Matthew Malcomson wrote:
> diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
> index 36c06756a1f94cadae097b3aad654fbeba1cf2f3..73078e412d01a43c05195f01488b95a2bc7a20ec 100644
> --- a/gcc/config/aarch64/atomics.md
> +++ b/gcc/config/aarch64/atomics.md
> @@ -481,9 +481,9 @@
>  )
>  
>  (define_insn "atomic_store<mode>"
> -  [(set (match_operand:ALLI 0 "aarch64_sync_memory_operand" "=Q")
> +  [(set (match_operand:ALLI 0 "aarch64_9bit_offset_memory_operand" "=Q,Ust")

This is less than ideal because on earlier architectures the predicate
will allow the offset variants but register allocation will then have to
undo that to match the first alternative.

I think what we should do is define a wrapped variant of
aarch64_9bit_offset_memory_operand which uses that function but only
allows the offset when RCPC8_4 is enabled.

Something like

aarch64_rcpc_memory_operand (...)
{
  if (TARGET_RCPC8_4)
    return aarch64_9bit_offset_memory_operand (...);
  return aarch64_sync_memory_operand (...);
}

OK with that change.

R.

Matthew Malcomson Sept. 18, 2018, 3:36 p.m. UTC | #2
>> diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
>> index 36c06756a1f94cadae097b3aad654fbeba1cf2f3..73078e412d01a43c05195f01488b95a2bc7a20ec 100644
>> --- a/gcc/config/aarch64/atomics.md
>> +++ b/gcc/config/aarch64/atomics.md
>> @@ -481,9 +481,9 @@
>>   )
>>   
>>   (define_insn "atomic_store<mode>"
>> -  [(set (match_operand:ALLI 0 "aarch64_sync_memory_operand" "=Q")
>> +  [(set (match_operand:ALLI 0 "aarch64_9bit_offset_memory_operand" "=Q,Ust")
> This is less than ideal because on earlier architectures the predicate
> will allow the offset variants but register allocation will then have to
> undo that to match the first alternative.
>
> I think what we should do is define a wrapped variant of
> aarch64_9bit_offset_memory_operand which uses that function but only
> allows the offset when RCPC8_4 is enabled.
>
> Something like
>
> aarch64_rcpc_memory_operand (...)
> {
>    if (TARGET_RCPC8_4)
>      return aarch64_9bit_offset_memory_operand (...);
>    return aarch64_sync_memory_operand (...);
> }
>
> OK with that change.
>
> R.
>
>

Is defining that in the predicates.md file like the below OK?
(Just to keep the new predicates together, so they can be found in
predicates.md.)


(define_predicate "aarch64_rcpc_memory_operand"
   (if_then_else (match_test "AARCH64_ISA_RCPC8_4")
     (match_operand 0 "aarch64_9bit_offset_memory_operand")
     (match_operand 0 "aarch64_sync_memory_operand")))
Richard Earnshaw (lists) Sept. 18, 2018, 3:38 p.m. UTC | #3
On 18/09/18 16:36, Matthew Malcomson wrote:
> 
> Is defining that in the predicates.md file like the below OK?
> (Just to keep the new predicates together, so they can be found in
> predicates.md.)
> 
> 
> (define_predicate "aarch64_rcpc_memory_operand"
>   (if_then_else (match_test "AARCH64_ISA_RCPC8_4")
>     (match_operand 0 "aarch64_9bit_offset_memory_operand")
>     (match_operand 0 "aarch64_sync_memory_operand")))

Sure.

R.