
libstdc++: Improve code generation for atomic reference counting

Message ID 20161109212640.2552-1-suokkos@gmail.com
State New

Commit Message

Pauli Nov. 9, 2016, 9:26 p.m. UTC
Atomic reference counting generates pessimistic code on platforms where
the builtin atomics could instead optimize the following branch with a
subtract instruction.

To allow better code generation, a compile-time-constant addend can be
checked for a negative value. Those cases can then be optimized with the
__atomic_fetch_sub builtin.
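
For illustration (this sketch is not part of the patch; release() and
destroy() are hypothetical names), the pattern that benefits is a
reference-count decrement whose old value only feeds the following
branch:

  extern void destroy(void* obj);

  static inline void
  release(int* refcount, void* obj)
  {
    // __atomic_fetch_sub returns the previous value; "previous == 1"
    // is the same as "result == 0", so the compiler can branch on the
    // flags set by a locked subtract instead of emitting an
    // exchange-add followed by a compare.
    if (__atomic_fetch_sub(refcount, 1, __ATOMIC_ACQ_REL) == 1)
      destroy(obj);
  }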

The extra branch checking for the presence of threads in the application
generates many extra instructions, while the builtin atomics often
generate very few.

Can I assume that __builtin_constant_p is available in all compilers
where _GLIBCXX_ATOMIC_BUILTINS is available?
bits/stl_list.h assumes __builtin_constant_p is available inside
_GLIBCXX_USE_CXX11_ABI.
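
For reference, __builtin_constant_p only yields 1 when its argument is
known constant at compile time, so after inlining the check in
__exchange_and_add selects the subtract path per call site and folds
away at runtime. A hypothetical minimal demonstration:

  static inline int
  is_constant_negative(int val)
  {
    // Folds to 1 for a literal call such as is_constant_negative(-1)
    // when optimizing; evaluates to 0 for a runtime value.
    return __builtin_constant_p(val) && val < 0;
  }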

x86_64 test results:
FAIL: 22_locale/numpunct/members/char/3.cc execution test
FAIL: 22_locale/time_get/get_date/wchar_t/4.cc execution test
  # of expected passes		11941
  # of unexpected failures	2
  # of expected failures		65
  # of unsupported tests		244
---

2016-11-09  Pauli Nieminen  <suokkos@gmail.com>

	* include/ext/atomicity.h (__exchange_and_add): Optimize negative
	addition with the __atomic_fetch_sub builtin.
	(__atomic_add): Call __exchange_and_add to reuse the same code.
	(__exchange_and_add_dispatch, __atomic_add_dispatch): Optimize the
	threading check when the builtins generate inline instructions.

---
 libstdc++-v3/include/ext/atomicity.h | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

Comments

Jonathan Wakely Dec. 6, 2016, 12:06 p.m. UTC | #1
I missed this patch and have only just seen it today, sorry.

On 09/11/16 23:26 +0200, Pauli wrote:
>Atomic reference counting generates pessimistic code on platforms where
>the builtin atomics could instead optimize the following branch with a
>subtract instruction.

Which targets are affected?

>To allow better code generation, a compile-time-constant addend can be
>checked for a negative value. Those cases can then be optimized with the
>__atomic_fetch_sub builtin.

OK, that makes sense.

>The extra branch checking for the presence of threads in the application
>generates many extra instructions, while the builtin atomics often
>generate very few.

I'd like to see more justification for this. My understanding was that
the __gthread_active_p() check was cheap in many cases, and that
things like std::string reference counting in single-threaded
applications benefit from avoiding the atomics.

I don't feel comfortable making this change without data.
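
For instance, a single-threaded micro-benchmark along these lines (a
sketch assuming a GCC/Linux setup; the loop count is arbitrary) could
be built once with and once without -pthread to quantify the cost of
the __gthread_active_p() check:

  #include <ext/atomicity.h>
  #include <chrono>
  #include <cstdio>

  int main()
  {
    _Atomic_word counter = 0;
    auto start = std::chrono::steady_clock::now();
    // Exercises the dispatch path used by reference counting.
    for (int i = 0; i < 100000000; ++i)
      __gnu_cxx::__exchange_and_add_dispatch(&counter, 1);
    auto stop = std::chrono::steady_clock::now();
    auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>
		(stop - start);
    std::printf("%lld ns (counter = %d)\n",
		(long long)ns.count(), (int)counter);
  }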

>Can I assume that __builtin_constant_p is available in all compilers
>where _GLIBCXX_ATOMIC_BUILTINS is available?

Yes, that's fine. It's always available.

>--- a/libstdc++-v3/include/ext/atomicity.h
>+++ b/libstdc++-v3/include/ext/atomicity.h
>@@ -35,6 +35,14 @@
> #include <bits/gthr.h>
> #include <bits/atomic_word.h>
>
>+#if defined(__GCC_ATOMIC_INT_LOCK_FREE)

Isn't this always defined?

>+#define _GLIBCXX_INT_LOCK_FREE __GCC_ATOMIC_INT_LOCK_FREE
>+#else
>+#define _GLIBCXX_INT_LOCK_FREE 2

Why assume always lock-free if the macro isn't defined?

>+#endif
>+#define _GLIBCXX_ATOMIC_INLINE \
>+  (_GLIBCXX_ATOMIC_BUILTINS && _GLIBCXX_INT_LOCK_FREE == 2)
>+
> namespace __gnu_cxx _GLIBCXX_VISIBILITY(default)
> {
> _GLIBCXX_BEGIN_NAMESPACE_VERSION

Patch

diff --git a/libstdc++-v3/include/ext/atomicity.h b/libstdc++-v3/include/ext/atomicity.h
index 0fcb390..ecf5a9d 100644
--- a/libstdc++-v3/include/ext/atomicity.h
+++ b/libstdc++-v3/include/ext/atomicity.h
@@ -35,6 +35,14 @@ 
 #include <bits/gthr.h>
 #include <bits/atomic_word.h>
 
+#if defined(__GCC_ATOMIC_INT_LOCK_FREE)
+#define _GLIBCXX_INT_LOCK_FREE __GCC_ATOMIC_INT_LOCK_FREE
+#else
+#define _GLIBCXX_INT_LOCK_FREE 2
+#endif
+#define _GLIBCXX_ATOMIC_INLINE \
+  (_GLIBCXX_ATOMIC_BUILTINS && _GLIBCXX_INT_LOCK_FREE == 2)
+
 namespace __gnu_cxx _GLIBCXX_VISIBILITY(default)
 {
 _GLIBCXX_BEGIN_NAMESPACE_VERSION
@@ -46,11 +54,16 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
 #ifdef _GLIBCXX_ATOMIC_BUILTINS
   static inline _Atomic_word 
   __exchange_and_add(volatile _Atomic_word* __mem, int __val)
-  { return __atomic_fetch_add(__mem, __val, __ATOMIC_ACQ_REL); }
+  {
+    if (__builtin_constant_p(__val) && __val < 0)
+      return __atomic_fetch_sub(__mem, 0 - __val, __ATOMIC_ACQ_REL);
+    else
+      return __atomic_fetch_add(__mem, __val, __ATOMIC_ACQ_REL);
+  }
 
   static inline void
   __atomic_add(volatile _Atomic_word* __mem, int __val)
-  { __atomic_fetch_add(__mem, __val, __ATOMIC_ACQ_REL); }
+  { __exchange_and_add(__mem, __val); }
 #else
   _Atomic_word
   __attribute__ ((__unused__))
@@ -78,7 +91,7 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
   __exchange_and_add_dispatch(_Atomic_word* __mem, int __val)
   {
 #ifdef __GTHREADS
-    if (__gthread_active_p())
+    if (_GLIBCXX_ATOMIC_INLINE || __gthread_active_p())
       return __exchange_and_add(__mem, __val);
     else
       return __exchange_and_add_single(__mem, __val);
@@ -92,7 +105,7 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
   __atomic_add_dispatch(_Atomic_word* __mem, int __val)
   {
 #ifdef __GTHREADS
-    if (__gthread_active_p())
+    if (_GLIBCXX_ATOMIC_INLINE || __gthread_active_p())
       __atomic_add(__mem, __val);
     else
       __atomic_add_single(__mem, __val);
@@ -104,6 +117,9 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
 _GLIBCXX_END_NAMESPACE_VERSION
 } // namespace
 
+#undef _GLIBCXX_INT_LOCK_FREE
+#undef _GLIBCXX_ATOMIC_INLINE
+
 // Even if the CPU doesn't need a memory barrier, we need to ensure
 // that the compiler doesn't reorder memory accesses across the
 // barriers.