
[1/2] atomics: do not use __atomic primitives for RCU atomics

Message ID 1463863336-28760-2-git-send-email-cota@braap.org
State New

Commit Message

Emilio Cota May 21, 2016, 8:42 p.m. UTC
Commit a0aa44b4 ("include/qemu/atomic.h: default to __atomic functions")
made all atomics default (on recent GCC versions) to the __atomic primitives.

In the process, atomic_rcu_read/set were converted to implement
consume/release semantics, respectively. This is inefficient; for
correctness and maximum performance we only need an smp_read_barrier_depends
for reads, and an smp_wmb for writes. Fix it by using the original
definitions of these two primitives for all compilers.
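
For illustration only (not part of the patch), here is a minimal sketch of
the RCU publish/read pattern these primitives serve. The struct, variable
and function names are hypothetical, and it assumes glib's g_new0() plus
QEMU's rcu_read_lock()/rcu_read_unlock():

    struct config { int a; int b; };
    static struct config *global_cfg;

    /* Writer: fully initialize the new object, then publish it.
     * atomic_rcu_set() issues smp_wmb() so the initializing stores
     * cannot be reordered past the pointer store. */
    static void publish_config(int a, int b)
    {
        struct config *new_cfg = g_new0(struct config, 1);
        new_cfg->a = a;
        new_cfg->b = b;
        atomic_rcu_set(&global_cfg, new_cfg);
    }

    /* Reader: the dereference depends on the loaded pointer, so only
     * Alpha needs a real barrier (smp_read_barrier_depends()). */
    static int read_config_a(void)
    {
        int val;

        rcu_read_lock();
        val = atomic_rcu_read(&global_cfg)->a;
        rcu_read_unlock();
        return val;
    }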

Note that if these semantics turn out to be necessary to avoid false
positives under Thread Sanitizer, we could define them as such
under #ifdef __SANITIZE_THREAD__. I tried running QEMU under tsan
to explore this, but unfortunately I couldn't get it to work (tsan dies
with an "unexpected memory mapping"). I suspect, though, that the
atomic_read/set embedded in atomic_rcu_read/set would be enough for tsan.

Signed-off-by: Emilio G. Cota <cota@braap.org>
---
 include/qemu/atomic.h | 99 ++++++++++++++++++++++-----------------------------
 1 file changed, 43 insertions(+), 56 deletions(-)

Comments

Alex Bennée May 22, 2016, 7:58 a.m. UTC | #1
Emilio G. Cota <cota@braap.org> writes:

> Commit a0aa44b4 ("include/qemu/atomic.h: default to __atomic functions")
> made all atomics default (on recent GCC versions) to the __atomic primitives.
>
> In the process, atomic_rcu_read/set were converted to implement
> consume/release semantics, respectively. This is inefficient; for
> correctness and maximum performance we only need an smp_read_barrier_depends
> for reads, and an smp_wmb for writes. Fix it by using the original
> definitions of these two primitives for all compilers.
>
> Note that if these semantics turn out to be necessary to avoid false
> positives under Thread Sanitizer, we could define them as such
> under #ifdef __SANITIZE_THREAD__. I tried running QEMU under tsan
> to explore this, but unfortunately I couldn't get it to work (tsan dies
> with an "unexpected memory mapping"). I suspect, though, that the
> atomic_read/set embedded in atomic_rcu_read/set would be enough for tsan.

For tsan runs you need to re-build with:

  ./configure --cc=gcc --extra-cflags="-pie -fPIE -fsanitize=thread" --with-coroutine=gthread

Specifically, the coroutine ucontext stack switching really confuses TSAN.

>
> Signed-off-by: Emilio G. Cota <cota@braap.org>
> ---
>  include/qemu/atomic.h | 99 ++++++++++++++++++++++-----------------------------
>  1 file changed, 43 insertions(+), 56 deletions(-)
>
> diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h
> index 5bc4d6c..4d62425 100644
> --- a/include/qemu/atomic.h
> +++ b/include/qemu/atomic.h
> @@ -56,22 +56,6 @@
>      __atomic_store(ptr, &_val, __ATOMIC_RELAXED);     \
>  } while(0)
>
> -/* Atomic RCU operations imply weak memory barriers */
> -
> -#define atomic_rcu_read(ptr)                          \
> -    ({                                                \
> -    QEMU_BUILD_BUG_ON(sizeof(*ptr) > sizeof(void *)); \
> -    typeof(*ptr) _val;                                \
> -    __atomic_load(ptr, &_val, __ATOMIC_CONSUME);      \
> -    _val;                                             \
> -    })
> -
> -#define atomic_rcu_set(ptr, i) do {                   \
> -    QEMU_BUILD_BUG_ON(sizeof(*ptr) > sizeof(void *)); \
> -    typeof(*ptr) _val = (i);                          \
> -    __atomic_store(ptr, &_val, __ATOMIC_RELEASE);     \
> -} while(0)
> -
>  /* atomic_mb_read/set semantics map Java volatile variables. They are
>   * less expensive on some platforms (notably POWER & ARMv7) than fully
>   * sequentially consistent operations.
> @@ -242,46 +226,6 @@
>  #define atomic_read(ptr)       (*(__typeof__(*ptr) volatile*) (ptr))
>  #define atomic_set(ptr, i)     ((*(__typeof__(*ptr) volatile*) (ptr)) = (i))
>
> -/**
> - * atomic_rcu_read - reads a RCU-protected pointer to a local variable
> - * into a RCU read-side critical section. The pointer can later be safely
> - * dereferenced within the critical section.
> - *
> - * This ensures that the pointer copy is invariant thorough the whole critical
> - * section.
> - *
> - * Inserts memory barriers on architectures that require them (currently only
> - * Alpha) and documents which pointers are protected by RCU.
> - *
> - * atomic_rcu_read also includes a compiler barrier to ensure that
> - * value-speculative optimizations (e.g. VSS: Value Speculation
> - * Scheduling) does not perform the data read before the pointer read
> - * by speculating the value of the pointer.
> - *
> - * Should match atomic_rcu_set(), atomic_xchg(), atomic_cmpxchg().
> - */
> -#define atomic_rcu_read(ptr)    ({                \
> -    typeof(*ptr) _val = atomic_read(ptr);         \
> -    smp_read_barrier_depends();                   \
> -    _val;                                         \
> -})
> -
> -/**
> - * atomic_rcu_set - assigns (publicizes) a pointer to a new data structure
> - * meant to be read by RCU read-side critical sections.
> - *
> - * Documents which pointers will be dereferenced by RCU read-side critical
> - * sections and adds the required memory barriers on architectures requiring
> - * them. It also makes sure the compiler does not reorder code initializing the
> - * data structure before its publication.
> - *
> - * Should match atomic_rcu_read().
> - */
> -#define atomic_rcu_set(ptr, i)  do {              \
> -    smp_wmb();                                    \
> -    atomic_set(ptr, i);                           \
> -} while (0)
> -
>  /* These have the same semantics as Java volatile variables.
>   * See http://gee.cs.oswego.edu/dl/jmm/cookbook.html:
>   * "1. Issue a StoreStore barrier (wmb) before each volatile store."
> @@ -345,4 +289,47 @@
>  #define atomic_or(ptr, n)      ((void) __sync_fetch_and_or(ptr, n))
>
>  #endif /* __ATOMIC_RELAXED */
> +
> +/**
> + * atomic_rcu_read - reads a RCU-protected pointer to a local variable
> + * into a RCU read-side critical section. The pointer can later be safely
> + * dereferenced within the critical section.
> + *
> + * This ensures that the pointer copy is invariant thorough the whole critical
> + * section.
> + *
> + * Inserts memory barriers on architectures that require them (currently only
> + * Alpha) and documents which pointers are protected by RCU.
> + *
> + * atomic_rcu_read also includes a compiler barrier to ensure that
> + * value-speculative optimizations (e.g. VSS: Value Speculation
> + * Scheduling) does not perform the data read before the pointer read
> + * by speculating the value of the pointer.
> + *
> + * Should match atomic_rcu_set(), atomic_xchg(), atomic_cmpxchg().
> + */
> +#define atomic_rcu_read(ptr)    ({                    \
> +    QEMU_BUILD_BUG_ON(sizeof(*ptr) > sizeof(void *)); \
> +    typeof(*ptr) _val = atomic_read(ptr);             \
> +    smp_read_barrier_depends();                       \
> +    _val;                                             \
> +})
> +
> +/**
> + * atomic_rcu_set - assigns (publicizes) a pointer to a new data structure
> + * meant to be read by RCU read-side critical sections.
> + *
> + * Documents which pointers will be dereferenced by RCU read-side critical
> + * sections and adds the required memory barriers on architectures requiring
> + * them. It also makes sure the compiler does not reorder code initializing the
> + * data structure before its publication.
> + *
> + * Should match atomic_rcu_read().
> + */
> +#define atomic_rcu_set(ptr, i)  do {                  \
> +    QEMU_BUILD_BUG_ON(sizeof(*ptr) > sizeof(void *)); \
> +    smp_wmb();                                        \
> +    atomic_set(ptr, i);                               \
> +} while (0)
> +
>  #endif /* __QEMU_ATOMIC_H */


--
Alex Bennée
Paolo Bonzini May 23, 2016, 2:21 p.m. UTC | #2
On 21/05/2016 22:42, Emilio G. Cota wrote:
> Commit a0aa44b4 ("include/qemu/atomic.h: default to __atomic functions")
> made all atomics default (on recent GCC versions) to the __atomic primitives.
>
> In the process, atomic_rcu_read/set were converted to implement
> consume/release semantics, respectively. This is inefficient; for
> correctness and maximum performance we only need an smp_read_barrier_depends
> for reads, and an smp_wmb for writes. Fix it by using the original
> definitions of these two primitives for all compilers.

Indeed most compilers implement consume the same as acquire, which is
inefficient.  However, isn't in practice atomic_thread_fence(release) +
atomic_store(relaxed) the same as atomic_store(release)?
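
(Editorial aside: a hedged sketch of the two publish formulations being
compared, written directly with the __atomic builtins; 'slot' and 'val'
are hypothetical names.)

    static int *slot;

    /* (a) release fence followed by a relaxed store */
    static void publish_fence_plus_relaxed(int *val)
    {
        __atomic_thread_fence(__ATOMIC_RELEASE);
        __atomic_store_n(&slot, val, __ATOMIC_RELAXED);
    }

    /* (b) a single store-release */
    static void publish_store_release(int *val)
    {
        __atomic_store_n(&slot, val, __ATOMIC_RELEASE);
    }

    /* Both keep all earlier writes ordered before the pointer store;
     * (a) is, if anything, the stronger of the two because the fence
     * is not tied to this particular store. */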

Thanks,

Paolo
Emilio Cota May 23, 2016, 3:55 p.m. UTC | #3
On Mon, May 23, 2016 at 16:21:36 +0200, Paolo Bonzini wrote:
> On 21/05/2016 22:42, Emilio G. Cota wrote:
> > Commit a0aa44b4 ("include/qemu/atomic.h: default to __atomic functions")
> > made all atomics default (on recent GCC versions) to the __atomic primitives.
> >
> > In the process, atomic_rcu_read/set were converted to implement
> > consume/release semantics, respectively. This is inefficient; for
> > correctness and maximum performance we only need an smp_read_barrier_depends
> > for reads, and an smp_wmb for writes. Fix it by using the original
> > definitions of these two primitives for all compilers.
> 
> Indeed most compilers implement consume the same as acquire, which is
> inefficient.
> However, isn't in practice atomic_thread_fence(release) +
> atomic_store(relaxed) the same as atomic_store(release)?

Yes. However, this is not the issue I'm addressing with the patch.

The performance regression I measured is due to using load-acquire vs.
load+smp_read_barrier_depends(). In the latter case only Alpha will
emit a fence; in the former we always emit a load-acquire, which
is "stronger" (i.e. more constraining).

A similar thing applies to atomic_rcu_set, although I haven't
measured its impact. We only need smp_wmb+store, yet we emit a
store-release, which is again "stronger".
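
(Editorial aside: a hedged sketch of the read-side comparison above,
assuming the macros from qemu/atomic.h; the instruction notes describe
typical AArch64 code generation and are an assumption, not measured
output.)

    static int *slot;

    /* Consume load: most compilers promote this to acquire,
     * i.e. ldar on AArch64. */
    static int *read_consume(void)
    {
        int *p;
        __atomic_load(&slot, &p, __ATOMIC_CONSUME);
        return p;
    }

    /* Plain load plus dependency barrier: a plain ldr on AArch64,
     * with smp_read_barrier_depends() a no-op everywhere but Alpha. */
    static int *read_depends(void)
    {
        int *p = atomic_read(&slot);
        smp_read_barrier_depends();
        return p;
    }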

		E.
Richard Henderson May 23, 2016, 4:53 p.m. UTC | #4
On 05/21/2016 01:42 PM, Emilio G. Cota wrote:
> In the process, atomic_rcu_read/set were converted to implement
> consume/release semantics, respectively. This is inefficient; for
> correctness and maximum performance we only need an smp_read_barrier_depends
> for reads, and an smp_wmb for writes. Fix it by using the original
> definitions of these two primitives for all compilers.

For what host do you think this is inefficient?

In particular, what you've done is going to be less efficient for e.g. armv8, 
where the __atomic formulation is going to produce load-acquire and 
store-release instructions.  Whereas the separate barriers are going to produce 
two insns.

As for the common case of x86_64, what you're doing is going to make no 
difference at all.

So what are you trying to improve?


r~
Emilio Cota May 23, 2016, 5:09 p.m. UTC | #5
On Mon, May 23, 2016 at 09:53:00 -0700, Richard Henderson wrote:
> On 05/21/2016 01:42 PM, Emilio G. Cota wrote:
> >In the process, atomic_rcu_read/set were converted to implement
> >consume/release semantics, respectively. This is inefficient; for
> >correctness and maximum performance we only need an smp_read_barrier_depends
> >for reads, and an smp_wmb for writes. Fix it by using the original
> >definitions of these two primitives for all compilers.
> 
> For what host do you think this is inefficient?
> 
> In particular, what you've done is going to be less efficient for e.g.
> armv8, where the __atomic formulation is going to produce load-acquire and
> store-release instructions.  Whereas the separate barriers are going to
> produce two insns.
> 
> As for the common case of x86_64, what you're doing is going to make no
> difference at all.
> 
> So what are you trying to improve?

This is precisely what I tested on ARMv8. The goal is to not emit a fence at
all, i.e. to emit a plain load (LDR) instead of a load-acquire (LDAR).

I just realised that under #ifdef __ATOMIC we have:

#define smp_read_barrier_depends() ({ barrier(); __atomic_thread_fence(__ATOMIC_CONSUME); barrier(); })

Why? This should be:

#ifdef __alpha__
#define smp_read_barrier_depends()   asm volatile("mb":::"memory")
#endif

unconditionally.

My patch should have included this additional change to make sense.
Sorry for the confusion.

		E.

PS. And really equating smp_wmb/rmb to release/acquire as we have under
#ifdef __ATOMIC is hard to justify, other than to please tsan.
Paolo Bonzini May 24, 2016, 7:08 a.m. UTC | #6
On 23/05/2016 19:09, Emilio G. Cota wrote:
> 		E.
> 
> PS. And really equating smp_wmb/rmb to release/acquire as we have under
> #ifdef __ATOMIC is hard to justify, other than to please tsan.

That only makes a difference on arm64, right?

	acquire		release		rmb		wmb
x86	--		--		--		--
power	lwsync		lwsync		lwsync		lwsync
armv7	dmb		dmb		dmb		dmb
arm64	dmb ishld	dmb ish		dmb ishld	dmb ishst
ia64	--		--		--		--

Thanks,

Paolo
Emilio Cota May 24, 2016, 7:56 p.m. UTC | #7
On Tue, May 24, 2016 at 09:08:01 +0200, Paolo Bonzini wrote:
> On 23/05/2016 19:09, Emilio G. Cota wrote:
> > PS. And really equating smp_wmb/rmb to release/acquire as we have under
> > #ifdef __ATOMIC is hard to justify, other than to please tsan.
> 
> That only makes a difference on arm64, right?
> 
> 	acquire		release		rmb		wmb
> x86	--		--		--		--
> power	lwsync		lwsync		lwsync		lwsync
> armv7	dmb		dmb		dmb		dmb
> arm64	dmb ishld	dmb ish		dmb ishld	dmb ishst
> ia64	--		--		--		--

Yes. I now see why we're defining rmb/wmb based on acquire/release:
it's quite convenient given that the compiler provides them, and
the (tiny) differences in practice are not worth the trouble of
adding asm for them. So I take back my comment =)

The gains of getting rid of the consume barrier from atomic_rcu_read
are clear though; updated patch to follow.

Thanks,

		Emilio
Sergey Fedorov May 24, 2016, 7:59 p.m. UTC | #8
On 24/05/16 22:56, Emilio G. Cota wrote:
> On Tue, May 24, 2016 at 09:08:01 +0200, Paolo Bonzini wrote:
>> On 23/05/2016 19:09, Emilio G. Cota wrote:
>>> PS. And really equating smp_wmb/rmb to release/acquire as we have under
>>> #ifdef __ATOMIC is hard to justify, other than to please tsan.
>> That only makes a difference on arm64, right?
>>
>> 	acquire		release		rmb		wmb
>> x86	--		--		--		--
>> power	lwsync		lwsync		lwsync		lwsync
>> armv7	dmb		dmb		dmb		dmb
>> arm64	dmb ishld	dmb ish		dmb ishld	dmb ishst
>> ia64	--		--		--		--
> Yes. I now see why we're defining rmb/wmb based on acquire/release:
> it's quite convenient given that the compiler provides them, and
> the (tiny) differences in practice are not worth the trouble of
> adding asm for them. So I take back my comment =)
>
> The gains of getting rid of the consume barrier from atomic_rcu_read
> are clear though; updated patch to follow.

However, maybe it's not such a pain to maintain an optimized version for
AArch64 in assembly :P

Best,
Sergey
Alex Bennée May 25, 2016, 8:52 a.m. UTC | #9
Sergey Fedorov <serge.fdrv@gmail.com> writes:

> On 24/05/16 22:56, Emilio G. Cota wrote:
>> On Tue, May 24, 2016 at 09:08:01 +0200, Paolo Bonzini wrote:
>>> On 23/05/2016 19:09, Emilio G. Cota wrote:
>>>> PS. And really equating smp_wmb/rmb to release/acquire as we have under
>>>> #ifdef __ATOMIC is hard to justify, other than to please tsan.
>>> That only makes a difference on arm64, right?
>>>
>>> 	acquire		release		rmb		wmb
>>> x86	--		--		--		--
>>> power	lwsync		lwsync		lwsync		lwsync
>>> armv7	dmb		dmb		dmb		dmb
>>> arm64	dmb ishld	dmb ish		dmb ishld	dmb ishst
>>> ia64	--		--		--		--
>> Yes. I now see why we're defining rmb/wmb based on acquire/release:
>> it's quite convenient given that the compiler provides them, and
>> the (tiny) differences in practice are not worth the trouble of
>> adding asm for them. So I take back my comment =)
>>
>> The gains of getting rid of the consume barrier from atomic_rcu_read
>> are clear though; updated patch to follow.
>
> However, maybe it's not such a pain to maintain an optimized version for
> AArch64 in assembly :P

Please don't. The advantage of the builtins is that they are known by tools
like tsan.

>
> Best,
> Sergey


--
Alex Bennée
Sergey Fedorov May 25, 2016, 11:02 a.m. UTC | #10
On 25/05/16 11:52, Alex Bennée wrote:
> Sergey Fedorov <serge.fdrv@gmail.com> writes:
>
>> On 24/05/16 22:56, Emilio G. Cota wrote:
>>> On Tue, May 24, 2016 at 09:08:01 +0200, Paolo Bonzini wrote:
>>>> On 23/05/2016 19:09, Emilio G. Cota wrote:
>>>>> PS. And really equating smp_wmb/rmb to release/acquire as we have under
>>>>> #ifdef __ATOMIC is hard to justify, other than to please tsan.
>>>> That only makes a difference on arm64, right?
>>>>
>>>> 	acquire		release		rmb		wmb
>>>> x86	--		--		--		--
>>>> power	lwsync		lwsync		lwsync		lwsync
>>>> armv7	dmb		dmb		dmb		dmb
>>>> arm64	dmb ishld	dmb ish		dmb ishld	dmb ishst
>>>> ia64	--		--		--		--
>>> Yes. I now see why we're defining rmb/wmb based on acquire/release:
>>> it's quite convenient given that the compiler provides them, and
>>> the (tiny) differences in practice are not worth the trouble of
>>> adding asm for them. So I take back my comment =)
>>>
>>> The gains of getting rid of the consume barrier from atomic_rcu_read
>>> are clear though; updated patch to follow.
>> However, maybe it's not such a pain to maintain an optimized version for
>> AArch64 in assembly :P
> Please don't. The advantage of the builtins is they are known by things
> like tsan.
>

We can always do:

    #if defined(__aarch64__) && !defined(__SANITIZE_THREAD__)
    /* AArch64 asm variant */
    #else
    /* GCC __atomic variant */
    #endif
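
(For illustration, a hedged guess at what the "AArch64 asm variant"
branch could contain, mirroring the dmb variants from Paolo's table;
these definitions are an assumption, not existing QEMU code.)

    #define smp_mb()    asm volatile("dmb ish"   ::: "memory")
    #define smp_rmb()   asm volatile("dmb ishld" ::: "memory")
    #define smp_wmb()   asm volatile("dmb ishst" ::: "memory")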


Kind regards,
Sergey

Patch

diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h
index 5bc4d6c..4d62425 100644
--- a/include/qemu/atomic.h
+++ b/include/qemu/atomic.h
@@ -56,22 +56,6 @@ 
     __atomic_store(ptr, &_val, __ATOMIC_RELAXED);     \
 } while(0)
 
-/* Atomic RCU operations imply weak memory barriers */
-
-#define atomic_rcu_read(ptr)                          \
-    ({                                                \
-    QEMU_BUILD_BUG_ON(sizeof(*ptr) > sizeof(void *)); \
-    typeof(*ptr) _val;                                \
-    __atomic_load(ptr, &_val, __ATOMIC_CONSUME);      \
-    _val;                                             \
-    })
-
-#define atomic_rcu_set(ptr, i) do {                   \
-    QEMU_BUILD_BUG_ON(sizeof(*ptr) > sizeof(void *)); \
-    typeof(*ptr) _val = (i);                          \
-    __atomic_store(ptr, &_val, __ATOMIC_RELEASE);     \
-} while(0)
-
 /* atomic_mb_read/set semantics map Java volatile variables. They are
  * less expensive on some platforms (notably POWER & ARMv7) than fully
  * sequentially consistent operations.
@@ -242,46 +226,6 @@ 
 #define atomic_read(ptr)       (*(__typeof__(*ptr) volatile*) (ptr))
 #define atomic_set(ptr, i)     ((*(__typeof__(*ptr) volatile*) (ptr)) = (i))
 
-/**
- * atomic_rcu_read - reads a RCU-protected pointer to a local variable
- * into a RCU read-side critical section. The pointer can later be safely
- * dereferenced within the critical section.
- *
- * This ensures that the pointer copy is invariant thorough the whole critical
- * section.
- *
- * Inserts memory barriers on architectures that require them (currently only
- * Alpha) and documents which pointers are protected by RCU.
- *
- * atomic_rcu_read also includes a compiler barrier to ensure that
- * value-speculative optimizations (e.g. VSS: Value Speculation
- * Scheduling) does not perform the data read before the pointer read
- * by speculating the value of the pointer.
- *
- * Should match atomic_rcu_set(), atomic_xchg(), atomic_cmpxchg().
- */
-#define atomic_rcu_read(ptr)    ({                \
-    typeof(*ptr) _val = atomic_read(ptr);         \
-    smp_read_barrier_depends();                   \
-    _val;                                         \
-})
-
-/**
- * atomic_rcu_set - assigns (publicizes) a pointer to a new data structure
- * meant to be read by RCU read-side critical sections.
- *
- * Documents which pointers will be dereferenced by RCU read-side critical
- * sections and adds the required memory barriers on architectures requiring
- * them. It also makes sure the compiler does not reorder code initializing the
- * data structure before its publication.
- *
- * Should match atomic_rcu_read().
- */
-#define atomic_rcu_set(ptr, i)  do {              \
-    smp_wmb();                                    \
-    atomic_set(ptr, i);                           \
-} while (0)
-
 /* These have the same semantics as Java volatile variables.
  * See http://gee.cs.oswego.edu/dl/jmm/cookbook.html:
  * "1. Issue a StoreStore barrier (wmb) before each volatile store."
@@ -345,4 +289,47 @@ 
 #define atomic_or(ptr, n)      ((void) __sync_fetch_and_or(ptr, n))
 
 #endif /* __ATOMIC_RELAXED */
+
+/**
+ * atomic_rcu_read - reads a RCU-protected pointer to a local variable
+ * into a RCU read-side critical section. The pointer can later be safely
+ * dereferenced within the critical section.
+ *
+ * This ensures that the pointer copy is invariant thorough the whole critical
+ * section.
+ *
+ * Inserts memory barriers on architectures that require them (currently only
+ * Alpha) and documents which pointers are protected by RCU.
+ *
+ * atomic_rcu_read also includes a compiler barrier to ensure that
+ * value-speculative optimizations (e.g. VSS: Value Speculation
+ * Scheduling) does not perform the data read before the pointer read
+ * by speculating the value of the pointer.
+ *
+ * Should match atomic_rcu_set(), atomic_xchg(), atomic_cmpxchg().
+ */
+#define atomic_rcu_read(ptr)    ({                    \
+    QEMU_BUILD_BUG_ON(sizeof(*ptr) > sizeof(void *)); \
+    typeof(*ptr) _val = atomic_read(ptr);             \
+    smp_read_barrier_depends();                       \
+    _val;                                             \
+})
+
+/**
+ * atomic_rcu_set - assigns (publicizes) a pointer to a new data structure
+ * meant to be read by RCU read-side critical sections.
+ *
+ * Documents which pointers will be dereferenced by RCU read-side critical
+ * sections and adds the required memory barriers on architectures requiring
+ * them. It also makes sure the compiler does not reorder code initializing the
+ * data structure before its publication.
+ *
+ * Should match atomic_rcu_read().
+ */
+#define atomic_rcu_set(ptr, i)  do {                  \
+    QEMU_BUILD_BUG_ON(sizeof(*ptr) > sizeof(void *)); \
+    smp_wmb();                                        \
+    atomic_set(ptr, i);                               \
+} while (0)
+
 #endif /* __QEMU_ATOMIC_H */