diff mbox

libitm: Fix HTM fastpath.

Message ID 1452251258.26597.235.camel@localhost.localdomain
State New
Headers show

Commit Message

Torvald Riegel Jan. 8, 2016, 11:07 a.m. UTC
This patch fixes a thinko in the HTM fastpath implementation.  In a
nutshell, we also need to monitor the HTM fastpath control (ie,
htm_fastpath) variable from within a HW transaction on the HTM fastpath,
so that such a HW transaciton will only execute if the HTM hastpath is
still enabled.

We move htm_fastpath into the serial lock so that a HW transaction only
needs one cacheline of HTM capacity to monitor both htm_fastpath and
check that no non-HW-transaction is currently running.

Tested on x86_64-linux.

2016-01-08  Torvald Riegel  <triegel@redhat.com>

	* beginend.cc (GTM::gtm_thread::serial_lock): Put on cacheline
	boundary.
	(htm_fastpath): Remove.
	(gtm_thread::begin_transaction): Fix HTM fastpath.
	(_ITM_commitTransaction): Adapt.
	(_ITM_commitTransactionEH): Adapt.
	* libitm/config/linux/rwlock.h (gtm_rwlock): Add htm_fastpath member
	and accessors.
	* libitm/config/posix/rwlock.h (gtm_rwlock): Likewise.
	* libitm/config/posix/rwlock.cc (gtm_rwlock::gtm_rwlock): Adapt.
	* libitm/config/x86/sjlj.S (_ITM_beginTransaction): Fix HTM fastpath.
	* libitm/libitm_i.h (htm_fastpath): Remove declaration.
	* libitm/method-serial.cc (htm_mg): Adapt.
	(gtm_thread::serialirr_mode): Adapt.
	* libitm/query.cc (_ITM_inTransaction, _ITM_getTransactionId): Adapt.
diff mbox

Patch

commit bfa6b5c51e110ca38c0ce5aa7234e7b719eee5a6
Author: Torvald Riegel <triegel@redhat.com>
Date:   Wed Jan 6 15:34:57 2016 +0100

    libitm: Fix HTM fastpath.

diff --git a/libitm/beginend.cc b/libitm/beginend.cc
index 367edc8..015704b 100644
--- a/libitm/beginend.cc
+++ b/libitm/beginend.cc
@@ -32,7 +32,11 @@  using namespace GTM;
 extern __thread gtm_thread_tls _gtm_thr_tls;
 #endif
 
-gtm_rwlock GTM::gtm_thread::serial_lock;
+// Put this at the start of a cacheline so that serial_lock's writers and
+// htm_fastpath fields are on the same cacheline, so that HW transactions
+// only have to pay one cacheline capacity to monitor both.
+gtm_rwlock GTM::gtm_thread::serial_lock
+  __attribute__((aligned(HW_CACHELINE_SIZE)));
 gtm_thread *GTM::gtm_thread::list_of_threads = 0;
 unsigned GTM::gtm_thread::number_of_threads = 0;
 
@@ -54,9 +58,6 @@  static pthread_mutex_t global_tid_lock = PTHREAD_MUTEX_INITIALIZER;
 static pthread_key_t thr_release_key;
 static pthread_once_t thr_release_once = PTHREAD_ONCE_INIT;
 
-// See gtm_thread::begin_transaction.
-uint32_t GTM::htm_fastpath = 0;
-
 /* Allocate a transaction structure.  */
 void *
 GTM::gtm_thread::operator new (size_t s)
@@ -176,9 +177,11 @@  GTM::gtm_thread::begin_transaction (uint32_t prop, const gtm_jmpbuf *jb)
   // lock's writer flag and thus abort if another thread is or becomes a
   // serial transaction.  Therefore, if the fastpath is enabled, then a
   // transaction is not executing as a HW transaction iff the serial lock is
-  // write-locked.  This allows us to use htm_fastpath and the serial lock's
-  // writer flag to reliable determine whether the current thread runs a HW
-  // transaction, and thus we do not need to maintain this information in
+  // write-locked.  Also, HW transactions monitor the fastpath control
+  // variable, so that they will only execute if dispatch_htm is still the
+  // current method group.  This allows us to use htm_fastpath and the serial
+  // lock's writers flag to reliable determine whether the current thread runs
+  // a HW transaction, and thus we do not need to maintain this information in
   // per-thread state.
   // If an uninstrumented code path is not available, we can still run
   // instrumented code from a HW transaction because the HTM fastpath kicks
@@ -190,9 +193,14 @@  GTM::gtm_thread::begin_transaction (uint32_t prop, const gtm_jmpbuf *jb)
   // for any internal changes (e.g., they never abort visibly to the STM code
   // and thus do not trigger the standard retry handling).
 #ifndef HTM_CUSTOM_FASTPATH
-  if (likely(htm_fastpath && (prop & pr_hasNoAbort)))
+  if (likely(serial_lock.get_htm_fastpath() && (prop & pr_hasNoAbort)))
     {
-      for (uint32_t t = htm_fastpath; t; t--)
+      // Note that the snapshot of htm_fastpath that we take here could be
+      // outdated, and a different method group than dispatch_htm may have
+      // been chosen in the meantime.  Therefore, take care not not touch
+      // anything besides the serial lock, which is independent of method
+      // groups.
+      for (uint32_t t = serial_lock.get_htm_fastpath(); t; t--)
 	{
 	  uint32_t ret = htm_begin();
 	  if (htm_begin_success(ret))
@@ -200,9 +208,11 @@  GTM::gtm_thread::begin_transaction (uint32_t prop, const gtm_jmpbuf *jb)
 	      // We are executing a transaction now.
 	      // Monitor the writer flag in the serial-mode lock, and abort
 	      // if there is an active or waiting serial-mode transaction.
+	      // Also checks that htm_fastpath is still nonzero and thus
+	      // HW transactions are allowed to run.
 	      // Note that this can also happen due to an enclosing
 	      // serial-mode transaction; we handle this case below.
-	      if (unlikely(serial_lock.is_write_locked()))
+	      if (unlikely(serial_lock.htm_fastpath_disabled()))
 		htm_abort();
 	      else
 		// We do not need to set a_saveLiveVariables because of HTM.
@@ -213,9 +223,12 @@  GTM::gtm_thread::begin_transaction (uint32_t prop, const gtm_jmpbuf *jb)
 	  // retrying the transaction will be successful.
 	  if (!htm_abort_should_retry(ret))
 	    break;
+	  // Check whether the HTM fastpath has been disabled.
+	  if (!serial_lock.get_htm_fastpath())
+	    break;
 	  // Wait until any concurrent serial-mode transactions have finished.
 	  // This is an empty critical section, but won't be elided.
-	  if (serial_lock.is_write_locked())
+	  if (serial_lock.htm_fastpath_disabled())
 	    {
 	      tx = gtm_thr();
 	      if (unlikely(tx == NULL))
@@ -250,7 +263,7 @@  GTM::gtm_thread::begin_transaction (uint32_t prop, const gtm_jmpbuf *jb)
   // HTM fastpath aborted, and that we thus have to decide whether to retry
   // the fastpath (returning a_tryHTMFastPath) or just proceed with the
   // fallback method.
-  if (likely(htm_fastpath && (prop & pr_HTMRetryableAbort)))
+  if (likely(serial_lock.get_htm_fastpath() && (prop & pr_HTMRetryableAbort)))
     {
       tx = gtm_thr();
       if (unlikely(tx == NULL))
@@ -264,13 +277,15 @@  GTM::gtm_thread::begin_transaction (uint32_t prop, const gtm_jmpbuf *jb)
       // other fallback will use serial transactions, which don't use
       // restart_total but will reset it when committing.
       if (!(prop & pr_HTMRetriedAfterAbort))
-	tx->restart_total = htm_fastpath;
+	tx->restart_total = gtm_thread::serial_lock.get_htm_fastpath();
 
       if (--tx->restart_total > 0)
 	{
 	  // Wait until any concurrent serial-mode transactions have finished.
 	  // Essentially the same code as above.
-	  if (serial_lock.is_write_locked())
+	  if (!serial_lock.get_htm_fastpath())
+	    goto stop_custom_htm_fastpath;
+	  if (serial_lock.htm_fastpath_disabled())
 	    {
 	      if (tx->nesting > 0)
 		goto stop_custom_htm_fastpath;
@@ -668,7 +683,7 @@  _ITM_commitTransaction(void)
   // a serial-mode transaction.  If we are, then there will be no other
   // concurrent serial-mode transaction.
   // See gtm_thread::begin_transaction.
-  if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked()))
+  if (likely(!gtm_thread::serial_lock.htm_fastpath_disabled()))
     {
       htm_commit();
       return;
@@ -684,7 +699,7 @@  _ITM_commitTransactionEH(void *exc_ptr)
 {
 #if defined(USE_HTM_FASTPATH)
   // See _ITM_commitTransaction.
-  if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked()))
+  if (likely(!gtm_thread::serial_lock.htm_fastpath_disabled()))
     {
       htm_commit();
       return;
diff --git a/libitm/config/linux/rwlock.h b/libitm/config/linux/rwlock.h
index d9f364c..4dd1dab 100644
--- a/libitm/config/linux/rwlock.h
+++ b/libitm/config/linux/rwlock.h
@@ -41,19 +41,27 @@  struct gtm_thread;
 // read-to-write upgrades do not have a higher priority than writers.
 //
 // Do not change the layout of this class; it must remain a POD type with
-// standard layout, and the WRITERS field must be first (i.e., so the
+// standard layout, and the writers field must be first (i.e., so the
 // assembler code can assume that its address is equal to the address of the
-// respective instance of the class).
+// respective instance of the class), and htm_fastpath must be second.
 
 class gtm_rwlock
 {
-  // TODO Put futexes on different cachelines?
   std::atomic<int> writers;       // Writers' futex.
+  // We put the HTM fastpath control variable here so that HTM fastpath
+  // transactions can check efficiently whether they are allowed to run.
+  // This must be accessed atomically because threads can load this value
+  // when they are neither a registered reader nor writer (i.e., when they
+  // attempt to execute the HTM fastpath).
+  std::atomic<uint32_t> htm_fastpath;
+  // TODO Put these futexes on different cachelines?  (writers and htm_fastpath
+  // should remain on the same cacheline.
   std::atomic<int> writer_readers;// A confirmed writer waits here for readers.
   std::atomic<int> readers;       // Readers wait here for writers (iff true).
 
  public:
-  gtm_rwlock() : writers(0), writer_readers(0), readers(0) {};
+  gtm_rwlock() : writers(0), htm_fastpath(0), writer_readers(0), readers(0)
+  { }
 
   void read_lock (gtm_thread *tx);
   void read_unlock (gtm_thread *tx);
@@ -64,12 +72,28 @@  class gtm_rwlock
   bool write_upgrade (gtm_thread *tx);
   void write_upgrade_finish (gtm_thread *tx);
 
-  // Returns true iff there is a concurrent active or waiting writer.
-  // This is primarily useful for simple HyTM approaches, and the value being
-  // checked is loaded with memory_order_relaxed.
-  bool is_write_locked()
+  // Returns true iff there is a concurrent active or waiting writer, or
+  // htm_fastpath is zero. This is primarily useful for simple HyTM
+  // approaches, and the values being checked are loaded with
+  // memory_order_relaxed.
+  bool htm_fastpath_disabled ()
   {
-    return writers.load (memory_order_relaxed) != 0;
+    return writers.load (memory_order_relaxed) != 0
+	|| htm_fastpath.load (memory_order_relaxed) == 0;
+  }
+
+  // This does not need to return an exact value, hence relaxed MO is
+  // sufficient.
+  uint32_t get_htm_fastpath ()
+  {
+    return htm_fastpath.load (memory_order_relaxed);
+  }
+  // This must only be called while having acquired the write lock, and other
+  // threads do not need to load an exact value; hence relaxed MO is
+  // sufficient.
+  void set_htm_fastpath (uint32_t val)
+  {
+    htm_fastpath.store (val, memory_order_relaxed);
   }
 
  protected:
diff --git a/libitm/config/posix/rwlock.cc b/libitm/config/posix/rwlock.cc
index bf58ec0..bb09826 100644
--- a/libitm/config/posix/rwlock.cc
+++ b/libitm/config/posix/rwlock.cc
@@ -31,6 +31,7 @@  namespace GTM HIDDEN {
 
 gtm_rwlock::gtm_rwlock()
   : summary (0),
+    htm_fastpath (0),
     mutex (PTHREAD_MUTEX_INITIALIZER),
     c_readers (PTHREAD_COND_INITIALIZER),
     c_writers (PTHREAD_COND_INITIALIZER),
diff --git a/libitm/config/posix/rwlock.h b/libitm/config/posix/rwlock.h
index 81c29a6..1e74e64 100644
--- a/libitm/config/posix/rwlock.h
+++ b/libitm/config/posix/rwlock.h
@@ -46,9 +46,9 @@  struct gtm_thread;
 // read-to-write upgrades do not have a higher priority than writers.
 //
 // Do not change the layout of this class; it must remain a POD type with
-// standard layout, and the SUMMARY field must be first (i.e., so the
+// standard layout, and the summary field must be first (i.e., so the
 // assembler code can assume that its address is equal to the address of the
-// respective instance of the class).
+// respective instance of the class), and htm_fastpath must be second.
 
 class gtm_rwlock
 {
@@ -58,6 +58,13 @@  class gtm_rwlock
 
   std::atomic<unsigned int> summary;	// Bitmask of the above.
 
+  // We put the HTM fastpath control variable here so that HTM fastpath
+  // transactions can check efficiently whether they are allowed to run.
+  // This must be accessed atomically because threads can load this value
+  // when they are neither a registered reader nor writer (i.e., when they
+  // attempt to execute the HTM fastpath).
+  std::atomic<uint32_t> htm_fastpath;
+
   pthread_mutex_t mutex;	        // Held if manipulating any field.
   pthread_cond_t c_readers;	        // Readers wait here
   pthread_cond_t c_writers;	        // Writers wait here for writers
@@ -80,12 +87,28 @@  class gtm_rwlock
   bool write_upgrade (gtm_thread *tx);
   void write_upgrade_finish (gtm_thread *tx);
 
-  // Returns true iff there is a concurrent active or waiting writer.
-  // This is primarily useful for simple HyTM approaches, and the value being
-  // checked is loaded with memory_order_relaxed.
-  bool is_write_locked()
+  // Returns true iff there is a concurrent active or waiting writer, or
+  // htm_fastpath is zero. This is primarily useful for simple HyTM
+  // approaches, and the values being checked are loaded with
+  // memory_order_relaxed.
+  bool htm_fastpath_disabled ()
+  {
+    return (summary.load (memory_order_relaxed) & (a_writer | w_writer))
+	|| htm_fastpath.load (memory_order_relaxed) == 0;
+  }
+
+  // This does not need to return an exact value, hence relaxed MO is
+  // sufficient.
+  uint32_t get_htm_fastpath ()
+  {
+    return htm_fastpath.load (memory_order_relaxed);
+  }
+  // This must only be called while having acquired the write lock, and other
+  // threads do not need to load an exact value; hence relaxed MO is
+  // sufficient.
+  void set_htm_fastpath (uint32_t val)
   {
-    return summary.load (memory_order_relaxed) & (a_writer | w_writer);
+    htm_fastpath.store (val, memory_order_relaxed);
   }
 
  protected:
diff --git a/libitm/config/x86/sjlj.S b/libitm/config/x86/sjlj.S
index 4b79db7..3d2a922 100644
--- a/libitm/config/x86/sjlj.S
+++ b/libitm/config/x86/sjlj.S
@@ -81,8 +81,9 @@  SYM(_ITM_beginTransaction):
 	   back to another execution method.  RTM restores all registers after
 	   a HW transaction abort, so we can do the SW setjmp after aborts,
 	   and we have to because we might choose a SW fall back.  However,
-	   we have to explicitly save/restore the first argument (edi).  */
-	cmpl	$0, SYM(gtm_htm_fastpath)(%rip)
+	   we have to explicitly save/restore the first argument (edi).
+	   The htm_fastpath field is the second int in gtm_rwlock.  */
+	cmpl	$0, (SYM(gtm_serial_lock)+4)(%rip)
 	jz	.Lno_htm
 	testl	$pr_hasNoAbort, %edi
 	jz	.Lno_htm
@@ -95,6 +96,10 @@  SYM(_ITM_beginTransaction):
 	   this case in the retry policy implementation.  */
 	cmpl	$0, SYM(gtm_serial_lock)(%rip)
 	jnz	1f
+	/* Now also check that HW transactions are still allowed to run (see
+	   gtm_thread::begin_transaction for why this is necessary).  */
+	cmpl	$0, (SYM(gtm_serial_lock)+4)(%rip)
+	jz	1f
 	/* Everything is good.  Run the transaction, preferably using the
 	   uninstrumented code path.  Note that the following works because
 	   pr_uninstrumentedCode == a_runUninstrumentedCode.  */
@@ -102,8 +107,9 @@  SYM(_ITM_beginTransaction):
 	mov	$a_runInstrumentedCode, %eax
 	cmovnz	%edi, %eax
 	ret
-	/* There is a serial-mode transaction, so abort (see htm_abort()
-	   regarding the abort code).  */
+	/* There is a serial-mode transaction or HW transactions are not
+	   allowed anymore, so abort (see htm_abort() regarding the abort
+	   code).  */
 1:	xabort	$0xff
 .Ltxn_abort:
 	/* If it might make sense to retry the HTM fast path, let the C++
diff --git a/libitm/libitm_i.h b/libitm/libitm_i.h
index fd7c4d2..4379e1f 100644
--- a/libitm/libitm_i.h
+++ b/libitm/libitm_i.h
@@ -356,12 +356,6 @@  extern abi_dispatch *dispatch_htm();
 
 extern gtm_cacheline_mask gtm_mask_stack(gtm_cacheline *, gtm_cacheline_mask);
 
-// Control variable for the HTM fastpath that uses serial mode as fallback.
-// Non-zero if the HTM fastpath is enabled. See gtm_thread::begin_transaction.
-// Accessed from assembly language, thus the "asm" specifier on
-// the name, avoiding complex name mangling.
-extern uint32_t htm_fastpath __asm__(UPFX "gtm_htm_fastpath");
-
 } // namespace GTM
 
 #endif // LIBITM_I_H
diff --git a/libitm/method-serial.cc b/libitm/method-serial.cc
index 82d407d..7a937d6 100644
--- a/libitm/method-serial.cc
+++ b/libitm/method-serial.cc
@@ -222,13 +222,13 @@  struct htm_mg : public method_group
     // Enable the HTM fastpath if the HW is available.  The fastpath is
     // initially disabled.
 #ifdef USE_HTM_FASTPATH
-    htm_fastpath = htm_init();
+    gtm_thread::serial_lock.set_htm_fastpath(htm_init());
 #endif
   }
   virtual void fini()
   {
     // Disable the HTM fastpath.
-    htm_fastpath = 0;
+    gtm_thread::serial_lock.set_htm_fastpath(0);
   }
 };
 
@@ -288,7 +288,7 @@  GTM::gtm_thread::serialirr_mode ()
 #if defined(USE_HTM_FASTPATH)
   // HTM fastpath.  If we are executing a HW transaction, don't go serial but
   // continue.  See gtm_thread::begin_transaction.
-  if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked()))
+  if (likely(!gtm_thread::serial_lock.htm_fastpath_disabled()))
     return;
 #endif
 
diff --git a/libitm/query.cc b/libitm/query.cc
index b7a1180..ddce846 100644
--- a/libitm/query.cc
+++ b/libitm/query.cc
@@ -49,7 +49,7 @@  _ITM_inTransaction (void)
   // a transaction and thus we can't deduce this by looking at just the serial
   // lock.  This function isn't used in practice currently, so the easiest
   // way to handle it is to just abort.
-  if (htm_fastpath && htm_transaction_active())
+  if (gtm_thread::serial_lock.get_htm_fastpath() && htm_transaction_active())
     htm_abort();
 #endif
   struct gtm_thread *tx = gtm_thr();
@@ -69,7 +69,7 @@  _ITM_getTransactionId (void)
 {
 #if defined(USE_HTM_FASTPATH)
   // See ITM_inTransaction.
-  if (htm_fastpath && htm_transaction_active())
+  if (gtm_thread::serial_lock.get_htm_fastpath() && htm_transaction_active())
     htm_abort();
 #endif
   struct gtm_thread *tx = gtm_thr();