diff mbox

libitm: Fix HTM fastpath.

Message ID 1453479369.4592.62.camel@localhost.localdomain
State New
Headers show

Commit Message

Torvald Riegel Jan. 22, 2016, 4:16 p.m. UTC
On Fri, 2016-01-08 at 12:07 +0100, Torvald Riegel wrote:
> This patch fixes a thinko in the HTM fastpath implementation.  In a
> nutshell, we also need to monitor the HTM fastpath control (ie,
> htm_fastpath) variable from within a HW transaction on the HTM fastpath,
> so that such a HW transaciton will only execute if the HTM hastpath is
> still enabled.
> 
> We move htm_fastpath into the serial lock so that a HW transaction only
> needs one cacheline of HTM capacity to monitor both htm_fastpath and
> check that no non-HW-transaction is currently running.
> 
> Tested on x86_64-linux.
> 
> 2016-01-08  Torvald Riegel  <triegel@redhat.com>
> 
> 	* beginend.cc (GTM::gtm_thread::serial_lock): Put on cacheline
> 	boundary.
> 	(htm_fastpath): Remove.
> 	(gtm_thread::begin_transaction): Fix HTM fastpath.
> 	(_ITM_commitTransaction): Adapt.
> 	(_ITM_commitTransactionEH): Adapt.
> 	* libitm/config/linux/rwlock.h (gtm_rwlock): Add htm_fastpath member
> 	and accessors.
> 	* libitm/config/posix/rwlock.h (gtm_rwlock): Likewise.
> 	* libitm/config/posix/rwlock.cc (gtm_rwlock::gtm_rwlock): Adapt.
> 	* libitm/config/x86/sjlj.S (_ITM_beginTransaction): Fix HTM fastpath.
> 	* libitm/libitm_i.h (htm_fastpath): Remove declaration.
> 	* libitm/method-serial.cc (htm_mg): Adapt.
> 	(gtm_thread::serialirr_mode): Adapt.
> 	* libitm/query.cc (_ITM_inTransaction, _ITM_getTransactionId): Adapt.

I have committed the attached patch (a minor rebase compared to the
prior one) after offline approval by Jakub Jelinek.  He tested on PPC as
well, and the patch fixed the problem we saw there.
diff mbox

Patch

commit 51a5f38f0228ed7e4772bf1d0439f93ab4ffcf23
Author: torvald <torvald@138bc75d-0d04-0410-961f-82ee72b054a4>
Date:   Fri Jan 22 16:13:06 2016 +0000

    libitm: Fix HTM fastpath.
    
    	* beginend.cc (GTM::gtm_thread::serial_lock): Put on cacheline
    	boundary.
    	(htm_fastpath): Remove.
    	(gtm_thread::begin_transaction): Fix HTM fastpath.
    	(_ITM_commitTransaction): Adapt.
    	(_ITM_commitTransactionEH): Adapt.
    	* libitm/config/linux/rwlock.h (gtm_rwlock): Add htm_fastpath member
    	and accessors.
    	* libitm/config/posix/rwlock.h (gtm_rwlock): Likewise.
    	* libitm/config/posix/rwlock.cc (gtm_rwlock::gtm_rwlock): Adapt.
    	* libitm/config/x86/sjlj.S (_ITM_beginTransaction): Fix HTM fastpath.
    	* libitm/libitm_i.h (htm_fastpath): Remove declaration.
    	* libitm/method-serial.cc (htm_mg): Adapt.
    	(gtm_thread::serialirr_mode): Adapt.
    	* libitm/query.cc (_ITM_inTransaction, _ITM_getTransactionId): Adapt.
    
    git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@232735 138bc75d-0d04-0410-961f-82ee72b054a4

diff --git a/libitm/beginend.cc b/libitm/beginend.cc
index 00d28f4..1a258ad 100644
--- a/libitm/beginend.cc
+++ b/libitm/beginend.cc
@@ -32,7 +32,11 @@  using namespace GTM;
 extern __thread gtm_thread_tls _gtm_thr_tls;
 #endif
 
-gtm_rwlock GTM::gtm_thread::serial_lock;
+// Put this at the start of a cacheline so that serial_lock's writers and
+// htm_fastpath fields are on the same cacheline, so that HW transactions
+// only have to pay one cacheline capacity to monitor both.
+gtm_rwlock GTM::gtm_thread::serial_lock
+  __attribute__((aligned(HW_CACHELINE_SIZE)));
 gtm_thread *GTM::gtm_thread::list_of_threads = 0;
 unsigned GTM::gtm_thread::number_of_threads = 0;
 
@@ -51,9 +55,6 @@  static pthread_mutex_t global_tid_lock = PTHREAD_MUTEX_INITIALIZER;
 static pthread_key_t thr_release_key;
 static pthread_once_t thr_release_once = PTHREAD_ONCE_INIT;
 
-// See gtm_thread::begin_transaction.
-uint32_t GTM::htm_fastpath = 0;
-
 /* Allocate a transaction structure.  */
 void *
 GTM::gtm_thread::operator new (size_t s)
@@ -173,9 +174,11 @@  GTM::gtm_thread::begin_transaction (uint32_t prop, const gtm_jmpbuf *jb)
   // lock's writer flag and thus abort if another thread is or becomes a
   // serial transaction.  Therefore, if the fastpath is enabled, then a
   // transaction is not executing as a HW transaction iff the serial lock is
-  // write-locked.  This allows us to use htm_fastpath and the serial lock's
-  // writer flag to reliable determine whether the current thread runs a HW
-  // transaction, and thus we do not need to maintain this information in
+  // write-locked.  Also, HW transactions monitor the fastpath control
+  // variable, so that they will only execute if dispatch_htm is still the
+  // current method group.  This allows us to use htm_fastpath and the serial
+  // lock's writers flag to reliable determine whether the current thread runs
+  // a HW transaction, and thus we do not need to maintain this information in
   // per-thread state.
   // If an uninstrumented code path is not available, we can still run
   // instrumented code from a HW transaction because the HTM fastpath kicks
@@ -187,9 +190,14 @@  GTM::gtm_thread::begin_transaction (uint32_t prop, const gtm_jmpbuf *jb)
   // for any internal changes (e.g., they never abort visibly to the STM code
   // and thus do not trigger the standard retry handling).
 #ifndef HTM_CUSTOM_FASTPATH
-  if (likely(htm_fastpath && (prop & pr_hasNoAbort)))
+  if (likely(serial_lock.get_htm_fastpath() && (prop & pr_hasNoAbort)))
     {
-      for (uint32_t t = htm_fastpath; t; t--)
+      // Note that the snapshot of htm_fastpath that we take here could be
+      // outdated, and a different method group than dispatch_htm may have
+      // been chosen in the meantime.  Therefore, take care not not touch
+      // anything besides the serial lock, which is independent of method
+      // groups.
+      for (uint32_t t = serial_lock.get_htm_fastpath(); t; t--)
 	{
 	  uint32_t ret = htm_begin();
 	  if (htm_begin_success(ret))
@@ -197,9 +205,11 @@  GTM::gtm_thread::begin_transaction (uint32_t prop, const gtm_jmpbuf *jb)
 	      // We are executing a transaction now.
 	      // Monitor the writer flag in the serial-mode lock, and abort
 	      // if there is an active or waiting serial-mode transaction.
+	      // Also checks that htm_fastpath is still nonzero and thus
+	      // HW transactions are allowed to run.
 	      // Note that this can also happen due to an enclosing
 	      // serial-mode transaction; we handle this case below.
-	      if (unlikely(serial_lock.is_write_locked()))
+	      if (unlikely(serial_lock.htm_fastpath_disabled()))
 		htm_abort();
 	      else
 		// We do not need to set a_saveLiveVariables because of HTM.
@@ -210,9 +220,12 @@  GTM::gtm_thread::begin_transaction (uint32_t prop, const gtm_jmpbuf *jb)
 	  // retrying the transaction will be successful.
 	  if (!htm_abort_should_retry(ret))
 	    break;
+	  // Check whether the HTM fastpath has been disabled.
+	  if (!serial_lock.get_htm_fastpath())
+	    break;
 	  // Wait until any concurrent serial-mode transactions have finished.
 	  // This is an empty critical section, but won't be elided.
-	  if (serial_lock.is_write_locked())
+	  if (serial_lock.htm_fastpath_disabled())
 	    {
 	      tx = gtm_thr();
 	      if (unlikely(tx == NULL))
@@ -247,7 +260,7 @@  GTM::gtm_thread::begin_transaction (uint32_t prop, const gtm_jmpbuf *jb)
   // HTM fastpath aborted, and that we thus have to decide whether to retry
   // the fastpath (returning a_tryHTMFastPath) or just proceed with the
   // fallback method.
-  if (likely(htm_fastpath && (prop & pr_HTMRetryableAbort)))
+  if (likely(serial_lock.get_htm_fastpath() && (prop & pr_HTMRetryableAbort)))
     {
       tx = gtm_thr();
       if (unlikely(tx == NULL))
@@ -261,13 +274,15 @@  GTM::gtm_thread::begin_transaction (uint32_t prop, const gtm_jmpbuf *jb)
       // other fallback will use serial transactions, which don't use
       // restart_total but will reset it when committing.
       if (!(prop & pr_HTMRetriedAfterAbort))
-	tx->restart_total = htm_fastpath;
+	tx->restart_total = gtm_thread::serial_lock.get_htm_fastpath();
 
       if (--tx->restart_total > 0)
 	{
 	  // Wait until any concurrent serial-mode transactions have finished.
 	  // Essentially the same code as above.
-	  if (serial_lock.is_write_locked())
+	  if (!serial_lock.get_htm_fastpath())
+	    goto stop_custom_htm_fastpath;
+	  if (serial_lock.htm_fastpath_disabled())
 	    {
 	      if (tx->nesting > 0)
 		goto stop_custom_htm_fastpath;
@@ -691,7 +706,7 @@  _ITM_commitTransaction(void)
   // a serial-mode transaction.  If we are, then there will be no other
   // concurrent serial-mode transaction.
   // See gtm_thread::begin_transaction.
-  if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked()))
+  if (likely(!gtm_thread::serial_lock.htm_fastpath_disabled()))
     {
       htm_commit();
       return;
@@ -707,7 +722,7 @@  _ITM_commitTransactionEH(void *exc_ptr)
 {
 #if defined(USE_HTM_FASTPATH)
   // See _ITM_commitTransaction.
-  if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked()))
+  if (likely(!gtm_thread::serial_lock.htm_fastpath_disabled()))
     {
       htm_commit();
       return;
diff --git a/libitm/config/linux/rwlock.h b/libitm/config/linux/rwlock.h
index d9f364c..4dd1dab 100644
--- a/libitm/config/linux/rwlock.h
+++ b/libitm/config/linux/rwlock.h
@@ -41,19 +41,27 @@  struct gtm_thread;
 // read-to-write upgrades do not have a higher priority than writers.
 //
 // Do not change the layout of this class; it must remain a POD type with
-// standard layout, and the WRITERS field must be first (i.e., so the
+// standard layout, and the writers field must be first (i.e., so the
 // assembler code can assume that its address is equal to the address of the
-// respective instance of the class).
+// respective instance of the class), and htm_fastpath must be second.
 
 class gtm_rwlock
 {
-  // TODO Put futexes on different cachelines?
   std::atomic<int> writers;       // Writers' futex.
+  // We put the HTM fastpath control variable here so that HTM fastpath
+  // transactions can check efficiently whether they are allowed to run.
+  // This must be accessed atomically because threads can load this value
+  // when they are neither a registered reader nor writer (i.e., when they
+  // attempt to execute the HTM fastpath).
+  std::atomic<uint32_t> htm_fastpath;
+  // TODO Put these futexes on different cachelines?  (writers and htm_fastpath
+  // should remain on the same cacheline.
   std::atomic<int> writer_readers;// A confirmed writer waits here for readers.
   std::atomic<int> readers;       // Readers wait here for writers (iff true).
 
  public:
-  gtm_rwlock() : writers(0), writer_readers(0), readers(0) {};
+  gtm_rwlock() : writers(0), htm_fastpath(0), writer_readers(0), readers(0)
+  { }
 
   void read_lock (gtm_thread *tx);
   void read_unlock (gtm_thread *tx);
@@ -64,12 +72,28 @@  class gtm_rwlock
   bool write_upgrade (gtm_thread *tx);
   void write_upgrade_finish (gtm_thread *tx);
 
-  // Returns true iff there is a concurrent active or waiting writer.
-  // This is primarily useful for simple HyTM approaches, and the value being
-  // checked is loaded with memory_order_relaxed.
-  bool is_write_locked()
+  // Returns true iff there is a concurrent active or waiting writer, or
+  // htm_fastpath is zero. This is primarily useful for simple HyTM
+  // approaches, and the values being checked are loaded with
+  // memory_order_relaxed.
+  bool htm_fastpath_disabled ()
   {
-    return writers.load (memory_order_relaxed) != 0;
+    return writers.load (memory_order_relaxed) != 0
+	|| htm_fastpath.load (memory_order_relaxed) == 0;
+  }
+
+  // This does not need to return an exact value, hence relaxed MO is
+  // sufficient.
+  uint32_t get_htm_fastpath ()
+  {
+    return htm_fastpath.load (memory_order_relaxed);
+  }
+  // This must only be called while having acquired the write lock, and other
+  // threads do not need to load an exact value; hence relaxed MO is
+  // sufficient.
+  void set_htm_fastpath (uint32_t val)
+  {
+    htm_fastpath.store (val, memory_order_relaxed);
   }
 
  protected:
diff --git a/libitm/config/posix/rwlock.cc b/libitm/config/posix/rwlock.cc
index 1e1eea8..86f7105 100644
--- a/libitm/config/posix/rwlock.cc
+++ b/libitm/config/posix/rwlock.cc
@@ -31,6 +31,7 @@  namespace GTM HIDDEN {
 
 gtm_rwlock::gtm_rwlock()
   : summary (0),
+    htm_fastpath (0),
     mutex (PTHREAD_MUTEX_INITIALIZER),
     c_readers (PTHREAD_COND_INITIALIZER),
     c_writers (PTHREAD_COND_INITIALIZER),
diff --git a/libitm/config/posix/rwlock.h b/libitm/config/posix/rwlock.h
index 81c29a6..1e74e64 100644
--- a/libitm/config/posix/rwlock.h
+++ b/libitm/config/posix/rwlock.h
@@ -46,9 +46,9 @@  struct gtm_thread;
 // read-to-write upgrades do not have a higher priority than writers.
 //
 // Do not change the layout of this class; it must remain a POD type with
-// standard layout, and the SUMMARY field must be first (i.e., so the
+// standard layout, and the summary field must be first (i.e., so the
 // assembler code can assume that its address is equal to the address of the
-// respective instance of the class).
+// respective instance of the class), and htm_fastpath must be second.
 
 class gtm_rwlock
 {
@@ -58,6 +58,13 @@  class gtm_rwlock
 
   std::atomic<unsigned int> summary;	// Bitmask of the above.
 
+  // We put the HTM fastpath control variable here so that HTM fastpath
+  // transactions can check efficiently whether they are allowed to run.
+  // This must be accessed atomically because threads can load this value
+  // when they are neither a registered reader nor writer (i.e., when they
+  // attempt to execute the HTM fastpath).
+  std::atomic<uint32_t> htm_fastpath;
+
   pthread_mutex_t mutex;	        // Held if manipulating any field.
   pthread_cond_t c_readers;	        // Readers wait here
   pthread_cond_t c_writers;	        // Writers wait here for writers
@@ -80,12 +87,28 @@  class gtm_rwlock
   bool write_upgrade (gtm_thread *tx);
   void write_upgrade_finish (gtm_thread *tx);
 
-  // Returns true iff there is a concurrent active or waiting writer.
-  // This is primarily useful for simple HyTM approaches, and the value being
-  // checked is loaded with memory_order_relaxed.
-  bool is_write_locked()
+  // Returns true iff there is a concurrent active or waiting writer, or
+  // htm_fastpath is zero. This is primarily useful for simple HyTM
+  // approaches, and the values being checked are loaded with
+  // memory_order_relaxed.
+  bool htm_fastpath_disabled ()
+  {
+    return (summary.load (memory_order_relaxed) & (a_writer | w_writer))
+	|| htm_fastpath.load (memory_order_relaxed) == 0;
+  }
+
+  // This does not need to return an exact value, hence relaxed MO is
+  // sufficient.
+  uint32_t get_htm_fastpath ()
+  {
+    return htm_fastpath.load (memory_order_relaxed);
+  }
+  // This must only be called while having acquired the write lock, and other
+  // threads do not need to load an exact value; hence relaxed MO is
+  // sufficient.
+  void set_htm_fastpath (uint32_t val)
   {
-    return summary.load (memory_order_relaxed) & (a_writer | w_writer);
+    htm_fastpath.store (val, memory_order_relaxed);
   }
 
  protected:
diff --git a/libitm/config/x86/sjlj.S b/libitm/config/x86/sjlj.S
index 4b79db7..3d2a922 100644
--- a/libitm/config/x86/sjlj.S
+++ b/libitm/config/x86/sjlj.S
@@ -81,8 +81,9 @@  SYM(_ITM_beginTransaction):
 	   back to another execution method.  RTM restores all registers after
 	   a HW transaction abort, so we can do the SW setjmp after aborts,
 	   and we have to because we might choose a SW fall back.  However,
-	   we have to explicitly save/restore the first argument (edi).  */
-	cmpl	$0, SYM(gtm_htm_fastpath)(%rip)
+	   we have to explicitly save/restore the first argument (edi).
+	   The htm_fastpath field is the second int in gtm_rwlock.  */
+	cmpl	$0, (SYM(gtm_serial_lock)+4)(%rip)
 	jz	.Lno_htm
 	testl	$pr_hasNoAbort, %edi
 	jz	.Lno_htm
@@ -95,6 +96,10 @@  SYM(_ITM_beginTransaction):
 	   this case in the retry policy implementation.  */
 	cmpl	$0, SYM(gtm_serial_lock)(%rip)
 	jnz	1f
+	/* Now also check that HW transactions are still allowed to run (see
+	   gtm_thread::begin_transaction for why this is necessary).  */
+	cmpl	$0, (SYM(gtm_serial_lock)+4)(%rip)
+	jz	1f
 	/* Everything is good.  Run the transaction, preferably using the
 	   uninstrumented code path.  Note that the following works because
 	   pr_uninstrumentedCode == a_runUninstrumentedCode.  */
@@ -102,8 +107,9 @@  SYM(_ITM_beginTransaction):
 	mov	$a_runInstrumentedCode, %eax
 	cmovnz	%edi, %eax
 	ret
-	/* There is a serial-mode transaction, so abort (see htm_abort()
-	   regarding the abort code).  */
+	/* There is a serial-mode transaction or HW transactions are not
+	   allowed anymore, so abort (see htm_abort() regarding the abort
+	   code).  */
 1:	xabort	$0xff
 .Ltxn_abort:
 	/* If it might make sense to retry the HTM fast path, let the C++
diff --git a/libitm/libitm_i.h b/libitm/libitm_i.h
index ae88ff0..a8cff57 100644
--- a/libitm/libitm_i.h
+++ b/libitm/libitm_i.h
@@ -344,11 +344,6 @@  extern abi_dispatch *dispatch_gl_wt();
 extern abi_dispatch *dispatch_ml_wt();
 extern abi_dispatch *dispatch_htm();
 
-// Control variable for the HTM fastpath that uses serial mode as fallback.
-// Non-zero if the HTM fastpath is enabled. See gtm_thread::begin_transaction.
-// Accessed from assembly language, thus the "asm" specifier on
-// the name, avoiding complex name mangling.
-extern uint32_t htm_fastpath __asm__(UPFX "gtm_htm_fastpath");
 
 } // namespace GTM
 
diff --git a/libitm/method-serial.cc b/libitm/method-serial.cc
index 1123e34..a151b66 100644
--- a/libitm/method-serial.cc
+++ b/libitm/method-serial.cc
@@ -226,13 +226,13 @@  struct htm_mg : public method_group
     // Enable the HTM fastpath if the HW is available.  The fastpath is
     // initially disabled.
 #ifdef USE_HTM_FASTPATH
-    htm_fastpath = htm_init();
+    gtm_thread::serial_lock.set_htm_fastpath(htm_init());
 #endif
   }
   virtual void fini()
   {
     // Disable the HTM fastpath.
-    htm_fastpath = 0;
+    gtm_thread::serial_lock.set_htm_fastpath(0);
   }
 };
 
@@ -292,7 +292,7 @@  GTM::gtm_thread::serialirr_mode ()
 #if defined(USE_HTM_FASTPATH)
   // HTM fastpath.  If we are executing a HW transaction, don't go serial but
   // continue.  See gtm_thread::begin_transaction.
-  if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked()))
+  if (likely(!gtm_thread::serial_lock.htm_fastpath_disabled()))
     return;
 #endif
 
diff --git a/libitm/query.cc b/libitm/query.cc
index b7a1180..ddce846 100644
--- a/libitm/query.cc
+++ b/libitm/query.cc
@@ -49,7 +49,7 @@  _ITM_inTransaction (void)
   // a transaction and thus we can't deduce this by looking at just the serial
   // lock.  This function isn't used in practice currently, so the easiest
   // way to handle it is to just abort.
-  if (htm_fastpath && htm_transaction_active())
+  if (gtm_thread::serial_lock.get_htm_fastpath() && htm_transaction_active())
     htm_abort();
 #endif
   struct gtm_thread *tx = gtm_thr();
@@ -69,7 +69,7 @@  _ITM_getTransactionId (void)
 {
 #if defined(USE_HTM_FASTPATH)
   // See ITM_inTransaction.
-  if (htm_fastpath && htm_transaction_active())
+  if (gtm_thread::serial_lock.get_htm_fastpath() && htm_transaction_active())
     htm_abort();
 #endif
   struct gtm_thread *tx = gtm_thr();