
AArch64: Fix __sync_val_compare_and_swap [PR111404]

Message ID PAWPR08MB89823B565A671851A16FD40B83F0A@PAWPR08MB8982.eurprd08.prod.outlook.com
State New
Series AArch64: Fix __sync_val_compare_and_swap [PR111404]

Commit Message

Wilco Dijkstra Sept. 13, 2023, 2:54 p.m. UTC
__sync_val_compare_and_swap may be used on 128-bit types and either calls the
outline atomic code or uses an inline loop.  On AArch64 LDXP is only atomic if
the value is stored successfully using STXP, but the current implementations
do not perform the store if the comparison fails.  In this case the value returned
is not read atomically.
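
For illustration, here is a minimal example of the affected case (not part of
the patch; the function name and types are arbitrary):

/* A failed 16-byte compare-and-swap must still return an atomic snapshot
   of *p: before this fix the failure path skipped the STXP, so the two
   64-bit halves of the returned value could come from different writes.  */
unsigned __int128
val_cas16 (unsigned __int128 *p, unsigned __int128 expected,
           unsigned __int128 desired)
{
  return __sync_val_compare_and_swap (p, expected, desired);
}

Depending on -moutline-atomics this compiles to either a call into the
__aarch64_cas16_* helpers or an inline LDXP/STXP loop; the patch fixes both.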

Passes regress/bootstrap, OK for commit?

gcc/ChangeLog/
        PR target/111404
	* config/aarch64/aarch64.cc (aarch64_split_compare_and_swap):
        For 128-bit store the loaded value and loop if needed.

libgcc/ChangeLog/
        PR target/111404
        * config/aarch64/lse.S (__aarch64_cas16_acq_rel): Execute STLXP using
        either new value or loaded value.

---

Comments

Ramana Radhakrishnan Sept. 25, 2023, 8:54 p.m. UTC | #1
On Wed, Sep 13, 2023 at 3:55 PM Wilco Dijkstra via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
>
> __sync_val_compare_and_swap may be used on 128-bit types and either calls the
> outline atomic code or uses an inline loop.  On AArch64 LDXP is only atomic if
> the value is stored successfully using STXP, but the current implementations
> do not perform the store if the comparison fails.  In this case the value returned
> is not read atomically.

IIRC, the previous discussions in this space revolved around the
difficulty of the store writing to read-only memory, which is why I
think we went with LDXP in this form.
Has something changed since then?

Reviewed-by: Ramana Radhakrishnan <ramana@gcc.gnu.org>

regards
Ramana
Wilco Dijkstra Sept. 25, 2023, 11:07 p.m. UTC | #2
Hi Ramana,

>> __sync_val_compare_and_swap may be used on 128-bit types and either calls the
>> outline atomic code or uses an inline loop.  On AArch64 LDXP is only atomic if
>> the value is stored successfully using STXP, but the current implementations
>> do not perform the store if the comparison fails.  In this case the value returned
>> is not read atomically.
>
> IIRC, the previous discussions in this space revolved around the
> difficulty of the store writing to read-only memory, which is why I
> think we went with LDXP in this form.

That's not related to this patch - this fixes a serious atomicity bug that may
affect the Linux kernel since it uses the older sync primitives. Given that LDXP
is not atomic on its own, you have to execute the STXP even in the failure case.
Note that you can't rely on compare&swap not to write memory: load-exclusive
loops may either always write or avoid writes in the failure case if the load is
atomic. CAS instructions always write.
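
To make that concrete, here is a rough C-level model of the two loop shapes
(illustrative only; ldxp16/stxp16 are hypothetical stand-ins for the LDXP and
STXP instructions, not real intrinsics):

/* Hypothetical helpers modelling the exclusive-pair instructions.  */
unsigned __int128 ldxp16 (unsigned __int128 *addr);
int stxp16 (unsigned __int128 *addr, unsigned __int128 value); /* 0 = stored */

unsigned __int128
cas16_model (unsigned __int128 *ptr, unsigned __int128 expected,
             unsigned __int128 desired)
{
  unsigned __int128 observed;
  do
    {
      observed = ldxp16 (ptr);
      /* Old code: on mismatch, branch out here without storing, so the
         observed value may be torn.  New code: always attempt the store,
         writing back either the new value (match) or the observed value
         (mismatch); only a successful STXP proves the LDXP was atomic.  */
    }
  while (stxp16 (ptr, observed == expected ? desired : observed) != 0);
  return observed;
}

The lse.S change implements the new shape with CSELs, and the inline
expansion achieves the same by storing the loaded value back on a mismatch.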

> Has something changed since then?

Yes, we now know that using locking atomics was a bad decision. Developers
actually require efficient and lock-free atomics. Since we didn't support them,
many applications were forced to add their own atomic implementations using
hacky inline assembler. It also resulted in a nasty ABI incompatibility between
GCC and LLVM. Yes - atomics are part of the ABI!

All that is much worse than worrying about a theoretical corner case that
can't happen in real applications - atomics only work on writeable memory
since their purpose is to synchronize reads with writes.

Cheers,
Wilco
Ramana Radhakrishnan Sept. 26, 2023, 10:23 p.m. UTC | #3
Hi Wilco,

Thanks for your email.

On Tue, Sep 26, 2023 at 12:07 AM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
>
> Hi Ramana,
>
> >> __sync_val_compare_and_swap may be used on 128-bit types and either calls the
> >> outline atomic code or uses an inline loop.  On AArch64 LDXP is only atomic if
> >> the value is stored successfully using STXP, but the current implementations
> >> do not perform the store if the comparison fails.  In this case the value returned
> >> is not read atomically.
> >
> > IIRC, the previous discussions in this space revolved around the
> > difficulty of the store writing to read-only memory, which is why I
> > think we went with LDXP in this form.
>
> That's not related to this patch - this fixes a serious atomicity bug that may
> affect the Linux kernel since it uses the older sync primitives. Given that LDXP
> is not atomic on its own, you have to execute the STXP even in the failure case.
> Note that you can't rely on compare&swap not to write memory: load-exclusive
> loops may either always write or avoid writes in the failure case if the load is
> atomic. CAS instructions always write.
>

I am aware of the capabilities of the architecture.

> > Has something changed since then?
>
> Yes, we now know that using locking atomics was a bad decision. Developers
> actually require efficient and lock-free atomics. Since we didn't support them,
> many applications were forced to add their own atomic implementations using
> hacky inline assembler. It also resulted in a nasty ABI incompatibility between
> GCC and LLVM. Yes - atomics are part of the ABI!

I agree that atomics are part of the ABI.

>
> All that is much worse than worrying about a theoretical corner case that
> can't happen in real applications - atomics only work on writeable memory
> since their purpose is to synchronize reads with writes.


I remember this being the previous discussion and common understanding.

https://gcc.gnu.org/legacy-ml/gcc/2016-06/msg00017.html

and here

https://gcc.gnu.org/legacy-ml/gcc-patches/2017-02/msg00168.html

Can you point me at any recent discussion that shows this has changed?
I can't find it in my searches. Perhaps you've had the discussion
somewhere that shows it has changed.


regards
Ramana
Wilco Dijkstra Oct. 16, 2023, 1:04 p.m. UTC | #4
ping
Wilco Dijkstra Oct. 16, 2023, 2:59 p.m. UTC | #5
Hi Ramana,

> I remember this being the previous discussion and common understanding.
>
> https://gcc.gnu.org/legacy-ml/gcc/2016-06/msg00017.html
>
> and here
> 
> https://gcc.gnu.org/legacy-ml/gcc-patches/2017-02/msg00168.html
>
> Can you point me at any recent discussion that shows this has changed?
> I can't find it in my searches. Perhaps you've had the discussion
> somewhere that shows it has changed.

Here are some more recent discussions about atomics; e.g. this one has good
arguments from developers wanting lock-free atomics:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80878

We also had some discussion of how we could handle the read-only corner
case by either giving a warning/error on const pointers to atomics or
ensuring _Atomic variables are writeable:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108659
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109553

My conclusion from that is that nobody cared enough to fix this for x86
in all these years, so it's not seen as an important issue.
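
Concretely, the corner case in question would look something like this (just
a sketch, not code from those PRs):

/* If the object is genuinely in read-only memory, any expansion that
   performs the store even on comparison failure will fault rather than
   just return the current value.  */
static const __int128 ro_value = 0;   /* may be placed in .rodata */

int
cas_on_rodata (void)
{
  __int128 expected = 1;
  /* Cast away const purely for illustration.  */
  return __atomic_compare_exchange_n ((__int128 *) &ro_value, &expected, 2,
                                      0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}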

We've had several internal design discussions to figure out how to fix the ABI
issues. The conclusion was that this is the only possible solution that makes
GCC and LLVM compatible without breaking backwards compatibility. It also
allows use of newer atomic instructions (which people want inlined).

Cheers,
Wilco
Wilco Dijkstra Nov. 6, 2023, 12:12 p.m. UTC | #6
ping
Richard Sandiford Nov. 30, 2023, 10:59 a.m. UTC | #7
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> __sync_val_compare_and_swap may be used on 128-bit types and either calls the
> outline atomic code or uses an inline loop.  On AArch64 LDXP is only atomic if
> the value is stored successfully using STXP, but the current implementations
> do not perform the store if the comparison fails.  In this case the value returned
> is not read atomically.
>
> Passes regress/bootstrap, OK for commit?
>
> gcc/ChangeLog/
>         PR target/111404
>         * config/aarch64/aarch64.cc (aarch64_split_compare_and_swap):
>         For 128-bit store the loaded value and loop if needed.
>
> libgcc/ChangeLog/
>         PR target/111404
>         * config/aarch64/lse.S (__aarch64_cas16_acq_rel): Execute STLXP using
>         either new value or loaded value.

Thanks for catching this.

The new aarch64_split_compare_and_swap code looks a bit twisty.
The approach in lse.S seems more obvious.  But I'm guessing you
didn't want to spend any time restructuring the pre-LSE
-mno-outline-atomics code, and I agree the patch in its current
form is more backportable.

So the patch is OK, thanks.  Sorry for the delay.

I suppose we might want to backport this after it has been in trunk
for a bit.

Richard
Wilco Dijkstra Nov. 30, 2023, 6:02 p.m. UTC | #8
Hi Richard,

Thanks for the review, now committed.

> The new aarch64_split_compare_and_swap code looks a bit twisty.
> The approach in lse.S seems more obvious.  But I'm guessing you
> didn't want to spend any time restructuring the pre-LSE
> -mno-outline-atomics code, and I agree the patch in its current
> form is more backportable.

Indeed this code needs cleaning up - all the complex speculation stuff
should be behind a simple interface. I was thinking of emitting CSEL
here but it would require adding new TI mode patterns or manually
splitting into low/high parts and emitting CSEL.

> I suppose we might want to backport this after it has been in trunk
> for a bit.

Yes that was my plan.

Cheers,
Wilco

Patch

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 5e8d0a0c91bc7719de2a8c5627b354cf905a4db0..c44c0b979d0cc3755c61dcf566cfddedccebf1ea 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -23413,11 +23413,11 @@  aarch64_split_compare_and_swap (rtx operands[])
   mem = operands[1];
   oldval = operands[2];
   newval = operands[3];
-  is_weak = (operands[4] != const0_rtx);
   model_rtx = operands[5];
   scratch = operands[7];
   mode = GET_MODE (mem);
   model = memmodel_from_int (INTVAL (model_rtx));
+  is_weak = operands[4] != const0_rtx && mode != TImode;
 
   /* When OLDVAL is zero and we want the strong version we can emit a tighter
     loop:
@@ -23478,6 +23478,33 @@  aarch64_split_compare_and_swap (rtx operands[])
   else
     aarch64_gen_compare_reg (NE, scratch, const0_rtx);
 
+  /* 128-bit LDAXP is not atomic unless STLXP succeeds.  So for a mismatch,
+     store the returned value and loop if the STLXP fails.  */
+  if (mode == TImode)
+    {
+      rtx_code_label *label3 = gen_label_rtx ();
+      emit_jump_insn (gen_rtx_SET (pc_rtx, gen_rtx_LABEL_REF (Pmode, label3)));
+      emit_barrier ();
+
+      emit_label (label2);
+      aarch64_emit_store_exclusive (mode, scratch, mem, rval, model_rtx);
+
+      if (aarch64_track_speculation)
+	{
+	  /* Emit an explicit compare instruction, so that we can correctly
+	     track the condition codes.  */
+	  rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
+	  x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
+	}
+      else
+	x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
+      x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+				gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
+      aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
+
+      label2 = label3;
+    }
+
   emit_label (label2);
 
   /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
diff --git a/libgcc/config/aarch64/lse.S b/libgcc/config/aarch64/lse.S
index dde3a28e07b13669533dfc5e8fac0a9a6ac33dbd..ba05047ff02b6fc5752235bffa924fc4a2f48c04 100644
--- a/libgcc/config/aarch64/lse.S
+++ b/libgcc/config/aarch64/lse.S
@@ -160,6 +160,8 @@  see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 #define tmp0	16
 #define tmp1	17
 #define tmp2	15
+#define tmp3	14
+#define tmp4	13
 
 #define BTI_C	hint	34
 
@@ -233,10 +235,11 @@  STARTFN	NAME(cas)
 0:	LDXP		x0, x1, [x4]
 	cmp		x0, x(tmp0)
 	ccmp		x1, x(tmp1), #0, eq
-	bne		1f
-	STXP		w(tmp2), x2, x3, [x4]
-	cbnz		w(tmp2), 0b
-1:	BARRIER
+	csel		x(tmp2), x2, x0, eq
+	csel		x(tmp3), x3, x1, eq
+	STXP		w(tmp4), x(tmp2), x(tmp3), [x4]
+	cbnz		w(tmp4), 0b
+	BARRIER
 	ret
 
 #endif