
[v3,1/2] mbcache: decoupling the locking of local from global data

Message ID: 1378312756-68597-2-git-send-email-tmac@hp.com
State: Changes Requested

Commit Message

T Makphaibulchoke Sept. 4, 2013, 4:39 p.m. UTC
The patch increases the parallelism of mb_cache_entry utilization by
replacing list_head with hlist_bl_node for the implementation of both the
block and index hash tables.  Each hlist_bl_head contains a built-in lock
used to protect mb_cache's local block and index hash chains. The global
data mb_cache_lru_list and mb_cache_list continue to be protected by the
global mb_cache_spinlock.

Signed-off-by: T. Makphaibulchoke <tmac@hp.com>
---
Changed in v3:
	- Removed all hash lock macros.
	- Fixed a possible race condition when updating the e_used and
	  e_queued members of an mb_cache_entry between the
	  mb_cache_entry_get function, traversing a block hash chain, and
	  the mb_cache_entry_find function, traversing an index hash chain.

Changed in v2:
	- As per Linus Torvalds' suggestion, instead of allocating spinlock
	  arrays to protect the hash chains, use hlist_bl_head, which already
	  contains a built-in lock.

 fs/mbcache.c            | 306 +++++++++++++++++++++++++++++++++++-------------
 include/linux/mbcache.h |  10 +-
 2 files changed, 229 insertions(+), 87 deletions(-)
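
For reference, the locking primitive the commit message relies on is the
bit spinlock embedded in each hlist_bl_head (bit 0 of the chain head's
first pointer).  Paraphrasing the helpers in include/linux/list_bl.h
(this sketch is not part of the patch):

    static inline void hlist_bl_lock(struct hlist_bl_head *b)
    {
            /* spin on bit 0 of the chain head's first pointer */
            bit_spin_lock(0, (unsigned long *)b);
    }

    static inline void hlist_bl_unlock(struct hlist_bl_head *b)
    {
            /*
             * With CONFIG_DEBUG_SPINLOCK, __bit_spin_unlock() BUGs in
             * bit_spinlock.h when bit 0 is not set, i.e. on an
             * unbalanced unlock (compare the oops reported below).
             */
            __bit_spin_unlock(0, (unsigned long *)b);
    }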

Comments

Theodore Ts'o Oct. 30, 2013, 2:42 p.m. UTC | #1
On Wed, Sep 04, 2013 at 10:39:15AM -0600, T Makphaibulchoke wrote:
> The patch increases the parallelism of mb_cache_entry utilization by
> replacing list_head with hlist_bl_node for the implementation of both the
> block and index hash tables.  Each hlist_bl_head contains a built-in lock
> used to protect mb_cache's local block and index hash chains. The global
> data mb_cache_lru_list and mb_cache_list continue to be protected by the
> global mb_cache_spinlock.
> 
> Signed-off-by: T. Makphaibulchoke <tmac@hp.com>

I tried running xfstests with this patch, and it blew up on
generic/020 test:

generic/020	[10:21:50][  105.170352] ------------[ cut here ]------------
[  105.171683] kernel BUG at /usr/projects/linux/ext4/include/linux/bit_spinlock.h:76!
[  105.173346] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC
[  105.173346] Modules linked in:
[  105.173346] CPU: 1 PID: 8519 Comm: attr Not tainted 3.12.0-rc5-00008-gffbe1d7-dirty #1492
[  105.173346] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[  105.173346] task: f5abe560 ti: f2274000 task.ti: f2274000
[  105.173346] EIP: 0060:[<c026b464>] EFLAGS: 00010246 CPU: 1
[  105.173346] EIP is at hlist_bl_unlock+0x7/0x1c
[  105.173346] EAX: f488d360 EBX: f488d360 ECX: 00000000 EDX: f2998800
[  105.173346] ESI: f29987f0 EDI: 6954c848 EBP: f2275cc8 ESP: f2275cb8
[  105.173346]  DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
[  105.173346] CR0: 80050033 CR2: b76bcf54 CR3: 34844000 CR4: 000006f0
[  105.173346] Stack:
[  105.173346]  c026bc78 f2275d48 6954c848 f29987f0 f2275d24 c02cd7a9 f2275ce4 c02e2881
[  105.173346]  f255d8c8 00000000 f1109020 f4a67f00 f2275d54 f2275d08 c02cd020 6954c848
[  105.173346]  f4a67f00 f1109000 f2b0eba8 f2ee3800 f2275d28 f4f811e8 f2275d38 00000000
[  105.173346] Call Trace:
[  105.173346]  [<c026bc78>] ? mb_cache_entry_find_first+0x4b/0x55
[  105.173346]  [<c02cd7a9>] ext4_xattr_block_set+0x248/0x6e7
[  105.173346]  [<c02e2881>] ? jbd2_journal_put_journal_head+0xe2/0xed
[  105.173346]  [<c02cd020>] ? ext4_xattr_find_entry+0x52/0xac
[  105.173346]  [<c02ce307>] ext4_xattr_set_handle+0x1c7/0x30f
[  105.173346]  [<c02ce4f4>] ext4_xattr_set+0xa5/0xe1
[  105.173346]  [<c02ceb36>] ext4_xattr_user_set+0x46/0x5f
[  105.173346]  [<c024a4da>] generic_setxattr+0x4c/0x5e
[  105.173346]  [<c024a48e>] ? generic_listxattr+0x95/0x95
[  105.173346]  [<c024ab0f>] __vfs_setxattr_noperm+0x56/0xb6
[  105.173346]  [<c024abd2>] vfs_setxattr+0x63/0x7e
[  105.173346]  [<c024ace8>] setxattr+0xfb/0x139
[  105.173346]  [<c01b200a>] ? __lock_acquire+0x540/0xca6
[  105.173346]  [<c01877a3>] ? lg_local_unlock+0x1b/0x34
[  105.173346]  [<c01af8dd>] ? trace_hardirqs_off_caller+0x2e/0x98
[  105.173346]  [<c0227e69>] ? kmem_cache_free+0xd4/0x149
[  105.173346]  [<c01b2c2b>] ? lock_acquire+0xdd/0x107
[  105.173346]  [<c023225e>] ? __sb_start_write+0xee/0x11d
[  105.173346]  [<c0247383>] ? mnt_want_write+0x1e/0x3e
[  105.173346]  [<c01b3019>] ? trace_hardirqs_on_caller+0x12a/0x17e
[  105.173346]  [<c0247353>] ? __mnt_want_write+0x4e/0x60
[  105.173346]  [<c024af3b>] SyS_lsetxattr+0x6a/0x9f
[  105.173346]  [<c078d0e8>] syscall_call+0x7/0xb
[  105.173346] Code: 00 00 00 00 5b 5d c3 55 89 e5 53 3e 8d 74 26 00 8b 58 08 89 c2 8b 43 18 e8 3f c9 fb ff f0 ff 4b 0c 5b 5d c3 8b 10 80 e2 01 75 02 <0f> 0b 55 89 e5 0f ba 30 00 89 e0 25 00 e0 ff ff ff 48 14 5d c3
[  105.173346] EIP: [<c026b464>] hlist_bl_unlock+0x7/0x1c SS:ESP 0068:f2275cb8
[  105.273781] ---[ end trace 1ee45ddfc1df0935 ]---

When I tried to find a potential problem, I immediately ran into this.
I'm not entirely sure it's the problem, but it raised a number of
red flags for me in terms of (a) how much testing you've done with
this patch set, and (b) how maintainable and easy to audit the code
will be with this extra locking.  The comments are a good start, but
some additional comments about exactly which locks a function assumes
are held on entry, and especially about cases where the locking
differs between function entry and function exit, would make it
easier for people to audit this patch.
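
For instance (purely illustrative, not something taken from the patch),
a header comment along these lines on each helper would make the lock
contract explicit:

    /*
     * __mb_cache_entry_release_unlock(ce)
     *
     * entry: the caller holds ce->e_block_hash_p's chain lock
     * exit:  both the block and index hash chain locks are released;
     *        ce may have been freed if it was unhashed and idle
     */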

Or maybe this commit needs to be split up: first a conversion from
list_head to hlist_bl_node, and then the locking change?  The bottom
line is that we need to somehow make this patch easier to
validate/review.

> @@ -520,18 +647,23 @@ __mb_cache_entry_find(struct list_head *l, struct list_head *head,
>  				ce->e_queued++;
>  				prepare_to_wait(&mb_cache_queue, &wait,
>  						TASK_UNINTERRUPTIBLE);
> -				spin_unlock(&mb_cache_spinlock);
> +				hlist_bl_unlock(head);
>  				schedule();
> -				spin_lock(&mb_cache_spinlock);
> +				hlist_bl_lock(head);
> +				mb_assert(ce->e_index_hash_p == head);
>  				ce->e_queued--;
>  			}
> +			hlist_bl_unlock(head);
>  			finish_wait(&mb_cache_queue, &wait);
>  
> -			if (!__mb_cache_entry_is_hashed(ce)) {
> +			hlist_bl_lock(ce->e_block_hash_p);
> +			if (!__mb_cache_entry_is_block_hashed(ce)) {
>  				__mb_cache_entry_release_unlock(ce);
> -				spin_lock(&mb_cache_spinlock);
> +				hlist_bl_lock(head);

<---  are we missing a "hlist_bl_unlock(ce->e_block_hash_p);" here?

<---- Why is it that we call "hlist_bl_lock(head);" but not in the else clause?

>  				return ERR_PTR(-EAGAIN);
> -			}
> +			} else
> +				hlist_bl_unlock(ce->e_block_hash_p);
> +			mb_assert(ce->e_index_hash_p == head);
>  			return ce;
>  		}
>  		l = l->next; 


Cheers,

					- Ted
Thavatchai Makphaibulchoke Oct. 30, 2013, 5:32 p.m. UTC | #2
On 10/30/2013 08:42 AM, Theodore Ts'o wrote:
> I tried running xfstests with this patch, and it blew up on
> generic/020 test:
> 
> generic/020	[10:21:50][  105.170352] ------------[ cut here ]------------
> [  105.171683] kernel BUG at /usr/projects/linux/ext4/include/linux/bit_spinlock.h:76!
> [  105.173346] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC
> [  105.173346] Modules linked in:
> [  105.173346] CPU: 1 PID: 8519 Comm: attr Not tainted 3.12.0-rc5-00008-gffbe1d7-dirty #1492
> [  105.173346] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
> [  105.173346] task: f5abe560 ti: f2274000 task.ti: f2274000
> [  105.173346] EIP: 0060:[<c026b464>] EFLAGS: 00010246 CPU: 1
> [  105.173346] EIP is at hlist_bl_unlock+0x7/0x1c
> [  105.173346] EAX: f488d360 EBX: f488d360 ECX: 00000000 EDX: f2998800
> [  105.173346] ESI: f29987f0 EDI: 6954c848 EBP: f2275cc8 ESP: f2275cb8
> [  105.173346]  DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
> [  105.173346] CR0: 80050033 CR2: b76bcf54 CR3: 34844000 CR4: 000006f0
> [  105.173346] Stack:
> [  105.173346]  c026bc78 f2275d48 6954c848 f29987f0 f2275d24 c02cd7a9 f2275ce4 c02e2881
> [  105.173346]  f255d8c8 00000000 f1109020 f4a67f00 f2275d54 f2275d08 c02cd020 6954c848
> [  105.173346]  f4a67f00 f1109000 f2b0eba8 f2ee3800 f2275d28 f4f811e8 f2275d38 00000000
> [  105.173346] Call Trace:
> [  105.173346]  [<c026bc78>] ? mb_cache_entry_find_first+0x4b/0x55
> [  105.173346]  [<c02cd7a9>] ext4_xattr_block_set+0x248/0x6e7
> [  105.173346]  [<c02e2881>] ? jbd2_journal_put_journal_head+0xe2/0xed
> [  105.173346]  [<c02cd020>] ? ext4_xattr_find_entry+0x52/0xac
> [  105.173346]  [<c02ce307>] ext4_xattr_set_handle+0x1c7/0x30f
> [  105.173346]  [<c02ce4f4>] ext4_xattr_set+0xa5/0xe1
> [  105.173346]  [<c02ceb36>] ext4_xattr_user_set+0x46/0x5f
> [  105.173346]  [<c024a4da>] generic_setxattr+0x4c/0x5e
> [  105.173346]  [<c024a48e>] ? generic_listxattr+0x95/0x95
> [  105.173346]  [<c024ab0f>] __vfs_setxattr_noperm+0x56/0xb6
> [  105.173346]  [<c024abd2>] vfs_setxattr+0x63/0x7e
> [  105.173346]  [<c024ace8>] setxattr+0xfb/0x139
> [  105.173346]  [<c01b200a>] ? __lock_acquire+0x540/0xca6
> [  105.173346]  [<c01877a3>] ? lg_local_unlock+0x1b/0x34
> [  105.173346]  [<c01af8dd>] ? trace_hardirqs_off_caller+0x2e/0x98
> [  105.173346]  [<c0227e69>] ? kmem_cache_free+0xd4/0x149
> [  105.173346]  [<c01b2c2b>] ? lock_acquire+0xdd/0x107
> [  105.173346]  [<c023225e>] ? __sb_start_write+0xee/0x11d
> [  105.173346]  [<c0247383>] ? mnt_want_write+0x1e/0x3e
> [  105.173346]  [<c01b3019>] ? trace_hardirqs_on_caller+0x12a/0x17e
> [  105.173346]  [<c0247353>] ? __mnt_want_write+0x4e/0x60
> [  105.173346]  [<c024af3b>] SyS_lsetxattr+0x6a/0x9f
> [  105.173346]  [<c078d0e8>] syscall_call+0x7/0xb
> [  105.173346] Code: 00 00 00 00 5b 5d c3 55 89 e5 53 3e 8d 74 26 00 8b 58 08 89 c2 8b 43 18 e8 3f c9 fb ff f0 ff 4b 0c 5b 5d c3 8b 10 80 e2 01 75 02 <0f> 0b 55 89 e5 0f ba 30 00 89 e0 25 00 e0 ff ff ff 48 14 5d c3
> [  105.173346] EIP: [<c026b464>] hlist_bl_unlock+0x7/0x1c SS:ESP 0068:f2275cb8
> [  105.273781] ---[ end trace 1ee45ddfc1df0935 ]---
> 
> When I tried to find a potential problem, I immediately ran into this.
> I'm not entirely sure it's the problem, but it raised a number of
> red flags for me in terms of (a) how much testing you've done with
> this patch set, and (b) how maintainable and easy to audit the code
> will be with this extra locking.  The comments are a good start, but
> some additional comments about exactly which locks a function assumes
> are held on entry, and especially about cases where the locking
> differs between function entry and function exit, would make it
> easier for people to audit this patch.
> 
> Or maybe this commit needs to be split up: first a conversion from
> list_head to hlist_bl_node, and then the locking change?  The bottom
> line is that we need to somehow make this patch easier to
> validate/review.
> 

Thanks for the comments.  Yes, I did run the patch through xfstests.  My guess is that you ran into a race condition that I did not.

I will try to port the patch to a more recent kernel, including the mb_cache_shrink_scan() change sent earlier (BTW, it looks good), and debug the problem.

Yes, those are good suggestions.  Once I find the problem, I will resubmit with more comments and also split the patch into two, as suggested.

>> @@ -520,18 +647,23 @@ __mb_cache_entry_find(struct list_head *l, struct list_head *head,
>>  				ce->e_queued++;
>>  				prepare_to_wait(&mb_cache_queue, &wait,
>>  						TASK_UNINTERRUPTIBLE);
>> -				spin_unlock(&mb_cache_spinlock);
>> +				hlist_bl_unlock(head);
>>  				schedule();
>> -				spin_lock(&mb_cache_spinlock);
>> +				hlist_bl_lock(head);
>> +				mb_assert(ce->e_index_hash_p == head);
>>  				ce->e_queued--;
>>  			}
>> +			hlist_bl_unlock(head);
>>  			finish_wait(&mb_cache_queue, &wait);
>>  
>> -			if (!__mb_cache_entry_is_hashed(ce)) {
>> +			hlist_bl_lock(ce->e_block_hash_p);
>> +			if (!__mb_cache_entry_is_block_hashed(ce)) {
>>  				__mb_cache_entry_release_unlock(ce);
>> -				spin_lock(&mb_cache_spinlock);
>> +				hlist_bl_lock(head);
> 
> <---  are we missing a "hlist_bl_unlock(ce->e_block_hash_p);" here?
> 
> <---- Why is it that we call "hlist_bl_lock(head);" but not in the else clause?

The function __mb_cache_entry_release_unlock() releases both the e_block_hash and e_index_hash chain locks.  So we have to reacquire the e_index_hash (head) lock in the then branch, and only release the e_block_hash lock in the else branch.
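
Spelled out as a lock-state annotation (just restating the above, not
code from the patch):

    /*
     * At this point 'head' (the index chain lock) has already been
     * dropped after the wait loop, and ce->e_block_hash_p has just
     * been taken.  __mb_cache_entry_release_unlock(ce) drops
     * e_block_hash_p plus the index chain lock it takes internally,
     * hence:
     *  - then branch: re-take 'head' before returning ERR_PTR(-EAGAIN)
     *  - else branch: drop only e_block_hash_p before returning ce
     */
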
> 
>>  				return ERR_PTR(-EAGAIN);
>> -			}
>> +			} else
>> +				hlist_bl_unlock(ce->e_block_hash_p);
>> +			mb_assert(ce->e_index_hash_p == head);
>>  			return ce;
>>  		}
>>  		l = l->next; 
> 
> 
> Cheers,
> 
> 					- Ted
> 

Thanks,
Mak.


Patch

diff --git a/fs/mbcache.c b/fs/mbcache.c
index 8c32ef3..dd45fe9 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -26,6 +26,38 @@ 
  * back on the lru list.
  */
 
+/*
+ * Lock descriptions and usage:
+ *
+ * Each hash chain of both the block and index hash tables now contains
+ * a built-in lock used to serialize accesses to the hash chain.
+ *
+ * Accesses to global data structures mb_cache_list and mb_cache_lru_list
+ * are serialized via the global spinlock mb_cache_spinlock.
+ *
+ * Lock ordering:
+ *
+ * Each block hash chain's lock has the highest order, followed by each
+ * index hash chain's lock, with mb_cache_spinlock the lowest.
+ * While holding a block hash chain lock a thread can acquire either
+ * an index hash chain lock or mb_cache_spinlock.
+ *
+ * Synchronization:
+ *
+ * Since both the e_used and e_queued members of each mb_cache_entry can
+ * be updated while traversing either a block hash chain or an index hash
+ * chain and the index hash chain lock has lower order, each index hash
+ * chain's lock, in addition to being used to serialize accesses to the
+ * index hash chain, is also used to serialize accesses to both e_used
+ * and e_queued.
+ *
+ * To avoid having a dangling reference to an already freed
+ * mb_cache_entry, an mb_cache_entry is only freed when it is not on a
+ * block hash chain and also no longer being referenced, i.e. both
+ * e_used and e_queued are 0.  When an mb_cache_entry is explicitly
+ * freed it is first removed from its block hash chain.
+ */
+
 #include <linux/kernel.h>
 #include <linux/module.h>
 
@@ -35,9 +67,9 @@ 
 #include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/init.h>
+#include <linux/list_bl.h>
 #include <linux/mbcache.h>
 
-
 #ifdef MB_CACHE_DEBUG
 # define mb_debug(f...) do { \
 		printk(KERN_DEBUG f); \
@@ -99,23 +131,34 @@  static struct shrinker mb_cache_shrinker = {
 };
 
 static inline int
-__mb_cache_entry_is_hashed(struct mb_cache_entry *ce)
+__mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce)
 {
-	return !list_empty(&ce->e_block_list);
+	return !hlist_bl_unhashed(&ce->e_block_list);
 }
 
 
 static void
-__mb_cache_entry_unhash(struct mb_cache_entry *ce)
+__mb_cache_entry_unhash_block(struct mb_cache_entry *ce)
 {
-	if (__mb_cache_entry_is_hashed(ce)) {
-		list_del_init(&ce->e_block_list);
-		list_del(&ce->e_index.o_list);
-	}
+	if (__mb_cache_entry_is_block_hashed(ce))
+		hlist_bl_del_init(&ce->e_block_list);
+}
+
+static inline int
+__mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce)
+{
+	return !hlist_bl_unhashed(&ce->e_index.o_list);
 }
 
 
 static void
+__mb_cache_entry_unhash_index(struct mb_cache_entry *ce)
+{
+	if (__mb_cache_entry_is_index_hashed(ce))
+		hlist_bl_del(&ce->e_index.o_list);
+}
+
+static void
 __mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
 {
 	struct mb_cache *cache = ce->e_cache;
@@ -128,8 +171,8 @@  __mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
 
 static void
 __mb_cache_entry_release_unlock(struct mb_cache_entry *ce)
-	__releases(mb_cache_spinlock)
 {
+	hlist_bl_lock(ce->e_index_hash_p);
 	/* Wake up all processes queuing for this cache entry. */
 	if (ce->e_queued)
 		wake_up_all(&mb_cache_queue);
@@ -137,15 +180,20 @@  __mb_cache_entry_release_unlock(struct mb_cache_entry *ce)
 		ce->e_used -= MB_CACHE_WRITER;
 	ce->e_used--;
 	if (!(ce->e_used || ce->e_queued)) {
-		if (!__mb_cache_entry_is_hashed(ce))
+		hlist_bl_unlock(ce->e_index_hash_p);
+		if (!__mb_cache_entry_is_block_hashed(ce))
 			goto forget;
+		spin_lock(&mb_cache_spinlock);
 		mb_assert(list_empty(&ce->e_lru_list));
 		list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
-	}
-	spin_unlock(&mb_cache_spinlock);
+		spin_unlock(&mb_cache_spinlock);
+	} else
+		hlist_bl_unlock(ce->e_index_hash_p);
+	hlist_bl_unlock(ce->e_block_hash_p);
 	return;
 forget:
-	spin_unlock(&mb_cache_spinlock);
+	hlist_bl_unlock(ce->e_block_hash_p);
+	mb_assert(list_empty(&ce->e_lru_list));
 	__mb_cache_entry_forget(ce, GFP_KERNEL);
 }
 
@@ -164,31 +212,46 @@  forget:
 static int
 mb_cache_shrink_fn(struct shrinker *shrink, struct shrink_control *sc)
 {
-	LIST_HEAD(free_list);
 	struct mb_cache *cache;
-	struct mb_cache_entry *entry, *tmp;
 	int count = 0;
 	int nr_to_scan = sc->nr_to_scan;
 	gfp_t gfp_mask = sc->gfp_mask;
 
 	mb_debug("trying to free %d entries", nr_to_scan);
-	spin_lock(&mb_cache_spinlock);
-	while (nr_to_scan-- && !list_empty(&mb_cache_lru_list)) {
-		struct mb_cache_entry *ce =
-			list_entry(mb_cache_lru_list.next,
-				   struct mb_cache_entry, e_lru_list);
-		list_move_tail(&ce->e_lru_list, &free_list);
-		__mb_cache_entry_unhash(ce);
+	while (nr_to_scan > 0) {
+		struct mb_cache_entry *ce;
+
+		spin_lock(&mb_cache_spinlock);
+		if (list_empty(&mb_cache_lru_list)) {
+			spin_unlock(&mb_cache_spinlock);
+			break;
+		}
+		ce = list_entry(mb_cache_lru_list.next,
+			struct mb_cache_entry, e_lru_list);
+		list_del_init(&ce->e_lru_list);
+		spin_unlock(&mb_cache_spinlock);
+
+		hlist_bl_lock(ce->e_block_hash_p);
+		hlist_bl_lock(ce->e_index_hash_p);
+		if (!(ce->e_used || ce->e_queued)) {
+			__mb_cache_entry_unhash_index(ce);
+			hlist_bl_unlock(ce->e_index_hash_p);
+			__mb_cache_entry_unhash_block(ce);
+			hlist_bl_unlock(ce->e_block_hash_p);
+			__mb_cache_entry_forget(ce, gfp_mask);
+			--nr_to_scan;
+		} else {
+			hlist_bl_unlock(ce->e_index_hash_p);
+			hlist_bl_unlock(ce->e_block_hash_p);
+		}
 	}
+	spin_lock(&mb_cache_spinlock);
 	list_for_each_entry(cache, &mb_cache_list, c_cache_list) {
 		mb_debug("cache %s (%d)", cache->c_name,
 			  atomic_read(&cache->c_entry_count));
 		count += atomic_read(&cache->c_entry_count);
 	}
 	spin_unlock(&mb_cache_spinlock);
-	list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
-		__mb_cache_entry_forget(entry, gfp_mask);
-	}
 	return (count / 100) * sysctl_vfs_cache_pressure;
 }
 
@@ -216,18 +279,18 @@  mb_cache_create(const char *name, int bucket_bits)
 	cache->c_name = name;
 	atomic_set(&cache->c_entry_count, 0);
 	cache->c_bucket_bits = bucket_bits;
-	cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head),
-	                              GFP_KERNEL);
+	cache->c_block_hash = kmalloc(bucket_count *
+		sizeof(struct hlist_bl_head), GFP_KERNEL);
 	if (!cache->c_block_hash)
 		goto fail;
 	for (n=0; n<bucket_count; n++)
-		INIT_LIST_HEAD(&cache->c_block_hash[n]);
-	cache->c_index_hash = kmalloc(bucket_count * sizeof(struct list_head),
-				      GFP_KERNEL);
+		INIT_HLIST_BL_HEAD(&cache->c_block_hash[n]);
+	cache->c_index_hash = kmalloc(bucket_count *
+		sizeof(struct hlist_bl_head), GFP_KERNEL);
 	if (!cache->c_index_hash)
 		goto fail;
 	for (n=0; n<bucket_count; n++)
-		INIT_LIST_HEAD(&cache->c_index_hash[n]);
+		INIT_HLIST_BL_HEAD(&cache->c_index_hash[n]);
 	cache->c_entry_cache = kmem_cache_create(name,
 		sizeof(struct mb_cache_entry), 0,
 		SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
@@ -276,13 +339,24 @@  mb_cache_shrink(struct block_device *bdev)
 			list_entry(l, struct mb_cache_entry, e_lru_list);
 		if (ce->e_bdev == bdev) {
 			list_move_tail(&ce->e_lru_list, &free_list);
-			__mb_cache_entry_unhash(ce);
 		}
 	}
 	spin_unlock(&mb_cache_spinlock);
 	list_for_each_safe(l, ltmp, &free_list) {
-		__mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
-						   e_lru_list), GFP_KERNEL);
+		struct mb_cache_entry *ce =
+			list_entry(l, struct mb_cache_entry, e_lru_list);
+		hlist_bl_lock(ce->e_block_hash_p);
+		hlist_bl_lock(ce->e_index_hash_p);
+		if (!(ce->e_used || ce->e_queued)) {
+			__mb_cache_entry_unhash_index(ce);
+			hlist_bl_unlock(ce->e_index_hash_p);
+			__mb_cache_entry_unhash_block(ce);
+			hlist_bl_unlock(ce->e_block_hash_p);
+			__mb_cache_entry_forget(ce, GFP_KERNEL);
+		} else {
+			hlist_bl_unlock(ce->e_index_hash_p);
+			hlist_bl_unlock(ce->e_block_hash_p);
+		}
 	}
 }
 
@@ -306,15 +380,21 @@  mb_cache_destroy(struct mb_cache *cache)
 			list_entry(l, struct mb_cache_entry, e_lru_list);
 		if (ce->e_cache == cache) {
 			list_move_tail(&ce->e_lru_list, &free_list);
-			__mb_cache_entry_unhash(ce);
 		}
 	}
 	list_del(&cache->c_cache_list);
 	spin_unlock(&mb_cache_spinlock);
 
 	list_for_each_safe(l, ltmp, &free_list) {
-		__mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
-						   e_lru_list), GFP_KERNEL);
+		struct mb_cache_entry *ce =
+			list_entry(l, struct mb_cache_entry, e_lru_list);
+		hlist_bl_lock(ce->e_block_hash_p);
+		hlist_bl_lock(ce->e_index_hash_p);
+		__mb_cache_entry_unhash_index(ce);
+		hlist_bl_unlock(ce->e_index_hash_p);
+		__mb_cache_entry_unhash_block(ce);
+		hlist_bl_unlock(ce->e_block_hash_p);
+		__mb_cache_entry_forget(ce, GFP_KERNEL);
 	}
 
 	if (atomic_read(&cache->c_entry_count) > 0) {
@@ -344,12 +424,27 @@  mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
 	struct mb_cache_entry *ce = NULL;
 
 	if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) {
+		struct list_head *l;
+		struct list_head *ltmp;
+
 		spin_lock(&mb_cache_spinlock);
-		if (!list_empty(&mb_cache_lru_list)) {
-			ce = list_entry(mb_cache_lru_list.next,
-					struct mb_cache_entry, e_lru_list);
-			list_del_init(&ce->e_lru_list);
-			__mb_cache_entry_unhash(ce);
+		list_for_each_safe(l, ltmp, &mb_cache_lru_list) {
+			ce = list_entry(l, struct mb_cache_entry, e_lru_list);
+			if (ce->e_cache == cache) {
+				list_del_init(&ce->e_lru_list);
+				spin_unlock(&mb_cache_spinlock);
+				hlist_bl_lock(ce->e_block_hash_p);
+				hlist_bl_lock(ce->e_index_hash_p);
+				if (!(ce->e_used || ce->e_queued)) {
+					__mb_cache_entry_unhash_index(ce);
+					hlist_bl_unlock(ce->e_index_hash_p);
+					__mb_cache_entry_unhash_block(ce);
+					hlist_bl_unlock(ce->e_block_hash_p);
+					break;
+				}
+				hlist_bl_unlock(ce->e_index_hash_p);
+				hlist_bl_unlock(ce->e_block_hash_p);
+			}
 		}
 		spin_unlock(&mb_cache_spinlock);
 	}
@@ -359,9 +454,12 @@  mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
 			return NULL;
 		atomic_inc(&cache->c_entry_count);
 		INIT_LIST_HEAD(&ce->e_lru_list);
-		INIT_LIST_HEAD(&ce->e_block_list);
+		INIT_HLIST_BL_NODE(&ce->e_block_list);
+		INIT_HLIST_BL_NODE(&ce->e_index.o_list);
 		ce->e_cache = cache;
 		ce->e_queued = 0;
+		ce->e_block_hash_p = &cache->c_block_hash[0];
+		ce->e_index_hash_p = &cache->c_index_hash[0];
 	}
 	ce->e_used = 1 + MB_CACHE_WRITER;
 	return ce;
@@ -388,28 +486,37 @@  mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
 {
 	struct mb_cache *cache = ce->e_cache;
 	unsigned int bucket;
-	struct list_head *l;
+	struct hlist_bl_node *l;
 	int error = -EBUSY;
+	struct hlist_bl_head *block_hash_p;
+	struct hlist_bl_head *index_hash_p;
+	struct mb_cache_entry *lce;
 
 	bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), 
 			   cache->c_bucket_bits);
-	spin_lock(&mb_cache_spinlock);
-	list_for_each_prev(l, &cache->c_block_hash[bucket]) {
-		struct mb_cache_entry *ce =
-			list_entry(l, struct mb_cache_entry, e_block_list);
-		if (ce->e_bdev == bdev && ce->e_block == block)
+	block_hash_p = &cache->c_block_hash[bucket];
+	hlist_bl_lock(block_hash_p);
+	hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) {
+		if (lce->e_bdev == bdev && lce->e_block == block)
 			goto out;
 	}
-	__mb_cache_entry_unhash(ce);
+	mb_assert(!__mb_cache_entry_is_hashed(ce));
+	__mb_cache_entry_unhash_index(ce);
+	__mb_cache_entry_unhash_block(ce);
 	ce->e_bdev = bdev;
 	ce->e_block = block;
-	list_add(&ce->e_block_list, &cache->c_block_hash[bucket]);
+	hlist_bl_add_head(&ce->e_block_list, block_hash_p);
+	ce->e_block_hash_p = block_hash_p;
 	ce->e_index.o_key = key;
 	bucket = hash_long(key, cache->c_bucket_bits);
-	list_add(&ce->e_index.o_list, &cache->c_index_hash[bucket]);
+	index_hash_p = &cache->c_index_hash[bucket];
+	hlist_bl_lock(index_hash_p);
+	hlist_bl_add_head(&ce->e_index.o_list, index_hash_p);
+	ce->e_index_hash_p = index_hash_p;
+	hlist_bl_unlock(index_hash_p);
 	error = 0;
 out:
-	spin_unlock(&mb_cache_spinlock);
+	hlist_bl_unlock(block_hash_p);
 	return error;
 }
 
@@ -424,7 +531,7 @@  out:
 void
 mb_cache_entry_release(struct mb_cache_entry *ce)
 {
-	spin_lock(&mb_cache_spinlock);
+	hlist_bl_lock(ce->e_block_hash_p);
 	__mb_cache_entry_release_unlock(ce);
 }
 
@@ -438,9 +545,13 @@  mb_cache_entry_release(struct mb_cache_entry *ce)
 void
 mb_cache_entry_free(struct mb_cache_entry *ce)
 {
-	spin_lock(&mb_cache_spinlock);
+	mb_assert(ce);
+	hlist_bl_lock(ce->e_block_hash_p);
+	hlist_bl_lock(ce->e_index_hash_p);
 	mb_assert(list_empty(&ce->e_lru_list));
-	__mb_cache_entry_unhash(ce);
+	__mb_cache_entry_unhash_index(ce);
+	hlist_bl_unlock(ce->e_index_hash_p);
+	__mb_cache_entry_unhash_block(ce);
 	__mb_cache_entry_release_unlock(ce);
 }
 
@@ -458,33 +569,46 @@  mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev,
 		   sector_t block)
 {
 	unsigned int bucket;
-	struct list_head *l;
+	struct hlist_bl_node *l;
 	struct mb_cache_entry *ce;
+	struct hlist_bl_head *block_hash_p;
+	int held_block_lock = 1;
 
 	bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
 			   cache->c_bucket_bits);
-	spin_lock(&mb_cache_spinlock);
-	list_for_each(l, &cache->c_block_hash[bucket]) {
-		ce = list_entry(l, struct mb_cache_entry, e_block_list);
+	block_hash_p = &cache->c_block_hash[bucket];
+	hlist_bl_lock(block_hash_p);
+	hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) {
+		mb_assert(ce->e_block_hash_p == block_hash_p);
 		if (ce->e_bdev == bdev && ce->e_block == block) {
 			DEFINE_WAIT(wait);
 
+			spin_lock(&mb_cache_spinlock);
 			if (!list_empty(&ce->e_lru_list))
 				list_del_init(&ce->e_lru_list);
+			spin_unlock(&mb_cache_spinlock);
 
+			hlist_bl_lock(ce->e_index_hash_p);
 			while (ce->e_used > 0) {
 				ce->e_queued++;
+				hlist_bl_unlock(ce->e_index_hash_p);
 				prepare_to_wait(&mb_cache_queue, &wait,
 						TASK_UNINTERRUPTIBLE);
-				spin_unlock(&mb_cache_spinlock);
+				if (held_block_lock) {
+					hlist_bl_unlock(block_hash_p);
+					held_block_lock = 0;
+				}
 				schedule();
-				spin_lock(&mb_cache_spinlock);
+				hlist_bl_lock(ce->e_index_hash_p);
 				ce->e_queued--;
 			}
-			finish_wait(&mb_cache_queue, &wait);
 			ce->e_used += 1 + MB_CACHE_WRITER;
+			hlist_bl_unlock(ce->e_index_hash_p);
+			finish_wait(&mb_cache_queue, &wait);
 
-			if (!__mb_cache_entry_is_hashed(ce)) {
+			if (!held_block_lock)
+				hlist_bl_lock(block_hash_p);
+			if (!__mb_cache_entry_is_block_hashed(ce)) {
 				__mb_cache_entry_release_unlock(ce);
 				return NULL;
 			}
@@ -494,24 +618,27 @@  mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev,
 	ce = NULL;
 
 cleanup:
-	spin_unlock(&mb_cache_spinlock);
+	hlist_bl_unlock(block_hash_p);
 	return ce;
 }
 
 #if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
 
 static struct mb_cache_entry *
-__mb_cache_entry_find(struct list_head *l, struct list_head *head,
+__mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head,
 		      struct block_device *bdev, unsigned int key)
 {
-	while (l != head) {
+	while (l != NULL) {
 		struct mb_cache_entry *ce =
-			list_entry(l, struct mb_cache_entry, e_index.o_list);
+			hlist_bl_entry(l, struct mb_cache_entry,
+				e_index.o_list);
 		if (ce->e_bdev == bdev && ce->e_index.o_key == key) {
 			DEFINE_WAIT(wait);
 
+			spin_lock(&mb_cache_spinlock);
 			if (!list_empty(&ce->e_lru_list))
 				list_del_init(&ce->e_lru_list);
+			spin_unlock(&mb_cache_spinlock);
 
 			/* Incrementing before holding the lock gives readers
 			   priority over writers. */
@@ -520,18 +647,23 @@  __mb_cache_entry_find(struct list_head *l, struct list_head *head,
 				ce->e_queued++;
 				prepare_to_wait(&mb_cache_queue, &wait,
 						TASK_UNINTERRUPTIBLE);
-				spin_unlock(&mb_cache_spinlock);
+				hlist_bl_unlock(head);
 				schedule();
-				spin_lock(&mb_cache_spinlock);
+				hlist_bl_lock(head);
+				mb_assert(ce->e_index_hash_p == head);
 				ce->e_queued--;
 			}
+			hlist_bl_unlock(head);
 			finish_wait(&mb_cache_queue, &wait);
 
-			if (!__mb_cache_entry_is_hashed(ce)) {
+			hlist_bl_lock(ce->e_block_hash_p);
+			if (!__mb_cache_entry_is_block_hashed(ce)) {
 				__mb_cache_entry_release_unlock(ce);
-				spin_lock(&mb_cache_spinlock);
+				hlist_bl_lock(head);
 				return ERR_PTR(-EAGAIN);
-			}
+			} else
+				hlist_bl_unlock(ce->e_block_hash_p);
+			mb_assert(ce->e_index_hash_p == head);
 			return ce;
 		}
 		l = l->next;
@@ -557,13 +689,17 @@  mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev,
 			  unsigned int key)
 {
 	unsigned int bucket = hash_long(key, cache->c_bucket_bits);
-	struct list_head *l;
-	struct mb_cache_entry *ce;
+	struct hlist_bl_node *l;
+	struct mb_cache_entry *ce = NULL;
+	struct hlist_bl_head *index_hash_p;
 
-	spin_lock(&mb_cache_spinlock);
-	l = cache->c_index_hash[bucket].next;
-	ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key);
-	spin_unlock(&mb_cache_spinlock);
+	index_hash_p = &cache->c_index_hash[bucket];
+	hlist_bl_lock(index_hash_p);
+	if (!hlist_bl_empty(index_hash_p)) {
+		l = hlist_bl_first(index_hash_p);
+		ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
+	}
+	hlist_bl_unlock(index_hash_p);
 	return ce;
 }
 
@@ -592,13 +728,17 @@  mb_cache_entry_find_next(struct mb_cache_entry *prev,
 {
 	struct mb_cache *cache = prev->e_cache;
 	unsigned int bucket = hash_long(key, cache->c_bucket_bits);
-	struct list_head *l;
+	struct hlist_bl_node *l;
 	struct mb_cache_entry *ce;
+	struct hlist_bl_head *index_hash_p;
 
-	spin_lock(&mb_cache_spinlock);
+	index_hash_p = &cache->c_index_hash[bucket];
+	mb_assert(prev->e_index_hash_p == index_hash_p);
+	hlist_bl_lock(index_hash_p);
+	mb_assert(!hlist_bl_empty(index_hash_p));
 	l = prev->e_index.o_list.next;
-	ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key);
-	__mb_cache_entry_release_unlock(prev);
+	ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
+	hlist_bl_unlock(index_hash_p);
 	return ce;
 }
 
diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h
index 5525d37..89826c0 100644
--- a/include/linux/mbcache.h
+++ b/include/linux/mbcache.h
@@ -11,11 +11,13 @@  struct mb_cache_entry {
 	unsigned short			e_queued;
 	struct block_device		*e_bdev;
 	sector_t			e_block;
-	struct list_head		e_block_list;
+	struct hlist_bl_node		e_block_list;
 	struct {
-		struct list_head	o_list;
+		struct hlist_bl_node	o_list;
 		unsigned int		o_key;
 	} e_index;
+	struct hlist_bl_head		*e_block_hash_p;
+	struct hlist_bl_head		*e_index_hash_p;
 };
 
 struct mb_cache {
@@ -25,8 +27,8 @@  struct mb_cache {
 	int				c_max_entries;
 	int				c_bucket_bits;
 	struct kmem_cache		*c_entry_cache;
-	struct list_head		*c_block_hash;
-	struct list_head		*c_index_hash;
+	struct hlist_bl_head		*c_block_hash;
+	struct hlist_bl_head		*c_index_hash;
 };
 
 /* Functions on caches */