diff mbox

[trans-mem] reduce contention caused by transaction id

Message ID 4D0772B6.5070407@redhat.com
State New
Headers show

Commit Message

Richard Henderson Dec. 14, 2010, 1:35 p.m. UTC
Having to ping-pong a cacheline between threads upon creation of any
transaction is quite wasteful.  We can essentially eliminate this
contention by allocating 2**N IDs to the thread at once.

Here I use 2**16 IDs per block, which, with 64-bit transaction IDs,
means that we have 2**48 such blocks to allocate between threads.  That
ought to be enough that even a badly behaving application that
continuously creates new threads cannot quickly exhaust the supply.

Idea courtesy of Torvald Riegel.


r~
	* beginend.cc (GTM::gtm_transaction::begin_transaction): Allocate
	blocks of TIDs per thread.
	* config/generic/tls.h (struct gtm_thread): Add local_tid member.
	(setup_gtm_thr): Return the thread structure.
	* config/x86/tls.h (setup_gtm_thr): Likewise.
diff mbox

Patch

Index: beginend.cc
===================================================================
--- beginend.cc	(revision 167789)
+++ beginend.cc	(working copy)
@@ -91,11 +91,13 @@ 
 uint32_t
 GTM::gtm_transaction::begin_transaction (uint32_t prop, const gtm_jmpbuf *jb)
 {
+  static const _ITM_transactionId_t tid_block_size = 1 << 16;
+
   gtm_transaction *tx;
   gtm_dispatch *disp;
   uint32_t ret;
 
-  setup_gtm_thr ();
+  gtm_thread *thr = setup_gtm_thr ();
 
   tx = new gtm_transaction;
 
@@ -103,13 +105,25 @@ 
   tx->prev = gtm_tx();
   if (tx->prev)
     tx->nesting = tx->prev->nesting + 1;
+
+  // As long as we have not exhausted a previously allocated block of TIDs,
+  // we can avoid an atomic operation on a shared cacheline.
+  if (thr->local_tid & (tid_block_size - 1))
+    tx->id = thr->local_tid++;
+  else
+    {
 #ifdef HAVE_64BIT_SYNC_BUILTINS
-  tx->id = __sync_add_and_fetch (&global_tid, 1);
+      tx->id = __sync_add_and_fetch (&global_tid, tid_block_size);
+      thr->local_tid = tx->id + 1;
 #else
-  pthread_mutex_lock (&global_tid_lock);
-  tx->id = ++global_tid;
-  pthread_mutex_unlock (&global_tid_lock);
+      pthread_mutex_lock (&global_tid_lock);
+      global_tid += tid_block_size;
+      tx->id = global_tid;
+      thr->local_tid = tx->id + 1;
+      pthread_mutex_unlock (&global_tid_lock);
 #endif
+    }
+
   tx->jb = *jb;
 
   set_gtm_tx (tx);
Index: config/x86/tls.h
===================================================================
--- config/x86/tls.h	(revision 167790)
+++ config/x86/tls.h	(working copy)
@@ -65,10 +65,15 @@ 
   return r;
 }
 
-static inline void setup_gtm_thr(void)
+static inline struct gtm_thread *setup_gtm_thr(void)
 {
-  if (gtm_thr() == NULL)
-    asm volatile (SEG_WRITE(10) : : "r"(&_gtm_thr));
+  gtm_thread *thr = gtm_thr();
+  if (thr == NULL)
+    {
+      thr = &_gtm_thr;
+      asm volatile (SEG_WRITE(10) : : "r"(thr));
+    }
+  return thr;
 }
 
 static inline struct gtm_transaction * gtm_tx(void)
Index: config/generic/tls.h
===================================================================
--- config/generic/tls.h	(revision 167789)
+++ config/generic/tls.h	(working copy)
@@ -50,6 +50,12 @@ 
   void *free_tx[MAX_FREE_TX];
   unsigned free_tx_idx, free_tx_count;
 
+  // In order to reduce cacheline contention on global_tid during
+  // beginTransaction, we allocate a block of 2**N ids to the thread
+  // all at once.  This number is the next value to be allocated from
+  // the block, or a multiple of 2**N if no such block is allocated.
+  _ITM_transactionId_t local_tid;
+
   // The value returned by _ITM_getThreadnum to identify this thread.
   // ??? At present, this is densely allocated beginning with 1 and
   // we don't bother filling in this value until it is requested.
@@ -67,7 +73,7 @@ 
 #ifndef HAVE_ARCH_GTM_THREAD
 // If the target does not provide optimized access to the thread-local
 // data, simply access the TLS variable defined above.
-static inline void setup_gtm_thr() { }
+static inline gtm_thread *setup_gtm_thr() { return &_gtm_thr; }
 static inline gtm_thread *gtm_thr() { return &_gtm_thr; }
 #endif