diff mbox series

[RFC] replace revoke hash table with rhashtable

Message ID 96FE09AA-171C-49B1-B434-505C15FEB435@whamcloud.com
State New
Headers show
Series [RFC] replace revoke hash table with rhashtable | expand

Commit Message

Alex Zhuravlev Aug. 31, 2021, 2:49 p.m. UTC
Hi,

Not so long ago we noticed that journal replay can take quite a lot (hours) in cases where many journaled blocks were freed during a short period.
I benchmarked hash table used by revoke code, basically it’s lookup+insert like jbd2 does at replay:

1048576 records - 95 seconds
2097152 records - 580 seconds

Then I benchmarked rhashtable:
1048576 records - 2 seconds
2097152 records - 3 seconds
4194304 records - 7 seconds

So, here is a patch replacing existing fixed-size hash table with rhashtable, please have a look.

Thanks, Alex


From 64b9161db7fee4eea665833765221d8a7e5903b6 Mon Sep 17 00:00:00 2001
From: Alex Zhuravlev <bzzz@whamcloud.com>
Date: Tue, 31 Aug 2021 11:53:09 +0300
Subject: [PATCH] jbd2 to replace fixed-size revoke hashtable with rhashtable

Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
---
fs/jbd2/journal.c    |   5 +-
fs/jbd2/revoke.c     | 255 +++++++++++++++----------------------------
include/linux/jbd2.h |   8 +-
3 files changed, 90 insertions(+), 178 deletions(-)

Comments

Andreas Dilger Sept. 1, 2021, 6:32 p.m. UTC | #1
On Aug 31, 2021, at 8:49 AM, Alex Zhuravlev <azhuravlev@whamcloud.com> wrote:
> 
> Hi,
> 
> Not so long ago we noticed that journal replay can take quite a lot (hours)
> in cases where many journaled blocks were freed during a short period.

It may be worthwhile to mention this was a case with a 4GB journal size.

> I benchmarked hash table used by revoke code, basically it’s lookup+insert
> like jbd2 does at replay:
> 
> 1048576 records - 95 seconds
> 2097152 records - 580 seconds
> 
> Then I benchmarked rhashtable:
> 1048576 records - 2 seconds
> 2097152 records - 3 seconds
> 4194304 records - 7 seconds
> 
> So, here is a patch replacing existing fixed-size hash table with rhashtable, please have a look.
> 
> Thanks, Alex

Alex,
the patch looks good from both a performance standpoint, as well as
a good reduction in lines (and possibly memory, for the cases where
there are fewer entries in the hash than the static table size).

I did notice some lines are using 8-space indents instead of tabs.
That can be fixed if there are any other comments, and you resubmit
without [RFC].

Cheers, Andreas
diff mbox series

Patch

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 35302bc192eb..6e3a2cc9dfd2 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1370,7 +1370,7 @@  static journal_t *journal_init_common(struct block_device *bdev,
	journal->j_flags = JBD2_ABORT;

	/* Set up a default-sized revoke table for the new mount. */
-	err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
+	err = jbd2_journal_init_revoke(journal);
	if (err)
		goto err_cleanup;

@@ -3136,8 +3136,6 @@  static int __init journal_init_caches(void)
	int ret;

	ret = jbd2_journal_init_revoke_record_cache();
-	if (ret == 0)
-		ret = jbd2_journal_init_revoke_table_cache();
	if (ret == 0)
		ret = jbd2_journal_init_journal_head_cache();
	if (ret == 0)
@@ -3152,7 +3150,6 @@  static int __init journal_init_caches(void)
static void jbd2_journal_destroy_caches(void)
{
	jbd2_journal_destroy_revoke_record_cache();
-	jbd2_journal_destroy_revoke_table_cache();
	jbd2_journal_destroy_journal_head_cache();
	jbd2_journal_destroy_handle_cache();
	jbd2_journal_destroy_inode_cache();
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index fa608788b93d..4533bd49e879 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -90,10 +90,10 @@ 
#include <linux/bio.h>
#include <linux/log2.h>
#include <linux/hash.h>
+#include <linux/rhashtable.h>
#endif

static struct kmem_cache *jbd2_revoke_record_cache;
-static struct kmem_cache *jbd2_revoke_table_cache;

/* Each revoke record represents one single revoked block.  During
   journal replay, this involves recording the transaction ID of the
@@ -101,23 +101,17 @@  static struct kmem_cache *jbd2_revoke_table_cache;

struct jbd2_revoke_record_s
{
-	struct list_head  hash;
+	struct rhash_head linkage;
	tid_t		  sequence;	/* Used for recovery only */
	unsigned long long	  blocknr;
};

-
-/* The revoke table is just a simple hash table of revoke records. */
-struct jbd2_revoke_table_s
-{
-	/* It is conceivable that we might want a larger hash table
-	 * for recovery.  Must be a power of two. */
-	int		  hash_size;
-	int		  hash_shift;
-	struct list_head *hash_table;
+static const struct rhashtable_params revoke_rhashtable_params = {
+	.key_len     = sizeof(unsigned long long),
+	.key_offset  = offsetof(struct jbd2_revoke_record_s, blocknr),
+	.head_offset = offsetof(struct jbd2_revoke_record_s, linkage),
};

-
#ifdef __KERNEL__
static void write_one_revoke_record(transaction_t *,
				    struct list_head *,
@@ -126,18 +120,10 @@  static void write_one_revoke_record(transaction_t *,
static void flush_descriptor(journal_t *, struct buffer_head *, int);
#endif

-/* Utility functions to maintain the revoke table */
-
-static inline int hash(journal_t *journal, unsigned long long block)
-{
-	return hash_64(block, journal->j_revoke->hash_shift);
-}
-
static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr,
			      tid_t seq)
{
-	struct list_head *hash_list;
-	struct jbd2_revoke_record_s *record;
+	struct jbd2_revoke_record_s *record, *old;
	gfp_t gfp_mask = GFP_NOFS;

	if (journal_oom_retry)
@@ -148,10 +134,12 @@  static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr,

	record->sequence = seq;
	record->blocknr = blocknr;
-	hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
-	spin_lock(&journal->j_revoke_lock);
-	list_add(&record->hash, hash_list);
-	spin_unlock(&journal->j_revoke_lock);
+	old = rhashtable_lookup_get_insert_fast(journal->j_revoke,
+					 &record->linkage, revoke_rhashtable_params);
+	if (old) {
+		BUG_ON(record->sequence != seq);
+		kmem_cache_free(jbd2_revoke_record_cache, record);
+	}
	return 0;
}

@@ -160,22 +148,8 @@  static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr,
static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal,
						      unsigned long long blocknr)
{
-	struct list_head *hash_list;
-	struct jbd2_revoke_record_s *record;
-
-	hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
-
-	spin_lock(&journal->j_revoke_lock);
-	record = (struct jbd2_revoke_record_s *) hash_list->next;
-	while (&(record->hash) != hash_list) {
-		if (record->blocknr == blocknr) {
-			spin_unlock(&journal->j_revoke_lock);
-			return record;
-		}
-		record = (struct jbd2_revoke_record_s *) record->hash.next;
-	}
-	spin_unlock(&journal->j_revoke_lock);
-	return NULL;
+	return rhashtable_lookup_fast(journal->j_revoke, &blocknr,
+				      revoke_rhashtable_params);
}

void jbd2_journal_destroy_revoke_record_cache(void)
@@ -184,12 +158,6 @@  void jbd2_journal_destroy_revoke_record_cache(void)
	jbd2_revoke_record_cache = NULL;
}

-void jbd2_journal_destroy_revoke_table_cache(void)
-{
-	kmem_cache_destroy(jbd2_revoke_table_cache);
-	jbd2_revoke_table_cache = NULL;
-}
-
int __init jbd2_journal_init_revoke_record_cache(void)
{
	J_ASSERT(!jbd2_revoke_record_cache);
@@ -203,85 +171,27 @@  int __init jbd2_journal_init_revoke_record_cache(void)
	return 0;
}

-int __init jbd2_journal_init_revoke_table_cache(void)
-{
-	J_ASSERT(!jbd2_revoke_table_cache);
-	jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s,
-					     SLAB_TEMPORARY);
-	if (!jbd2_revoke_table_cache) {
-		pr_emerg("JBD2: failed to create revoke_table cache\n");
-		return -ENOMEM;
-	}
-	return 0;
-}
-
-static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size)
-{
-	int shift = 0;
-	int tmp = hash_size;
-	struct jbd2_revoke_table_s *table;
-
-	table = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL);
-	if (!table)
-		goto out;
-
-	while((tmp >>= 1UL) != 0UL)
-		shift++;
-
-	table->hash_size = hash_size;
-	table->hash_shift = shift;
-	table->hash_table =
-		kmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL);
-	if (!table->hash_table) {
-		kmem_cache_free(jbd2_revoke_table_cache, table);
-		table = NULL;
-		goto out;
-	}
-
-	for (tmp = 0; tmp < hash_size; tmp++)
-		INIT_LIST_HEAD(&table->hash_table[tmp]);
-
-out:
-	return table;
-}
-
-static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table)
-{
-	int i;
-	struct list_head *hash_list;
-
-	for (i = 0; i < table->hash_size; i++) {
-		hash_list = &table->hash_table[i];
-		J_ASSERT(list_empty(hash_list));
-	}
-
-	kfree(table->hash_table);
-	kmem_cache_free(jbd2_revoke_table_cache, table);
-}
-
/* Initialise the revoke table for a given journal to a given size. */
-int jbd2_journal_init_revoke(journal_t *journal, int hash_size)
+int jbd2_journal_init_revoke(journal_t *journal)
{
-	J_ASSERT(journal->j_revoke_table[0] == NULL);
-	J_ASSERT(is_power_of_2(hash_size));
+	int rc;

-	journal->j_revoke_table[0] = jbd2_journal_init_revoke_table(hash_size);
-	if (!journal->j_revoke_table[0])
+	rc = rhashtable_init(&journal->j_revoke_table[0], &revoke_rhashtable_params);
+	if (rc)
		goto fail0;

-	journal->j_revoke_table[1] = jbd2_journal_init_revoke_table(hash_size);
-	if (!journal->j_revoke_table[1])
+	rc = rhashtable_init(&journal->j_revoke_table[1], &revoke_rhashtable_params);
+	if (rc)
		goto fail1;

-	journal->j_revoke = journal->j_revoke_table[1];
+	journal->j_revoke = &journal->j_revoke_table[1];

	spin_lock_init(&journal->j_revoke_lock);

	return 0;

fail1:
-	jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]);
-	journal->j_revoke_table[0] = NULL;
+	rhashtable_destroy(&journal->j_revoke_table[0]);
fail0:
	return -ENOMEM;
}
@@ -290,10 +200,8 @@  int jbd2_journal_init_revoke(journal_t *journal, int hash_size)
void jbd2_journal_destroy_revoke(journal_t *journal)
{
	journal->j_revoke = NULL;
-	if (journal->j_revoke_table[0])
-		jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]);
-	if (journal->j_revoke_table[1])
-		jbd2_journal_destroy_revoke_table(journal->j_revoke_table[1]);
+	rhashtable_destroy(&journal->j_revoke_table[0]);
+	rhashtable_destroy(&journal->j_revoke_table[1]);
}


@@ -446,9 +354,8 @@  int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
		if (record) {
			jbd_debug(4, "cancelled existing revoke on "
				  "blocknr %llu\n", (unsigned long long)bh->b_blocknr);
-			spin_lock(&journal->j_revoke_lock);
-			list_del(&record->hash);
-			spin_unlock(&journal->j_revoke_lock);
+			rhashtable_remove_fast(journal->j_revoke, &record->linkage,
+						revoke_rhashtable_params);
			kmem_cache_free(jbd2_revoke_record_cache, record);
			did_revoke = 1;
		}
@@ -483,27 +390,29 @@  int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
 */
void jbd2_clear_buffer_revoked_flags(journal_t *journal)
{
-	struct jbd2_revoke_table_s *revoke = journal->j_revoke;
-	int i = 0;
-
-	for (i = 0; i < revoke->hash_size; i++) {
-		struct list_head *hash_list;
-		struct list_head *list_entry;
-		hash_list = &revoke->hash_table[i];
-
-		list_for_each(list_entry, hash_list) {
-			struct jbd2_revoke_record_s *record;
-			struct buffer_head *bh;
-			record = (struct jbd2_revoke_record_s *)list_entry;
-			bh = __find_get_block(journal->j_fs_dev,
-					      record->blocknr,
-					      journal->j_blocksize);
-			if (bh) {
-				clear_buffer_revoked(bh);
-				__brelse(bh);
-			}
+	struct rhashtable *revoke = journal->j_revoke;
+	struct jbd2_revoke_record_s *record;
+	struct rhashtable_iter iter;
+
+        rhashtable_walk_enter(revoke, &iter);
+        rhashtable_walk_start(&iter);
+        while ((record = rhashtable_walk_next(&iter)) != NULL) {
+		struct buffer_head *bh;
+
+		if (IS_ERR(record))
+			continue;
+		rhashtable_walk_stop(&iter);
+		bh = __find_get_block(journal->j_fs_dev,
+				      record->blocknr,
+				      journal->j_blocksize);
+		if (bh) {
+			clear_buffer_revoked(bh);
+			__brelse(bh);
		}
-	}
+		rhashtable_walk_start(&iter);
+        }
+        rhashtable_walk_stop(&iter);
+        rhashtable_walk_exit(&iter);
}

/* journal_switch_revoke table select j_revoke for next transaction
@@ -512,15 +421,12 @@  void jbd2_clear_buffer_revoked_flags(journal_t *journal)
 */
void jbd2_journal_switch_revoke_table(journal_t *journal)
{
-	int i;
-
-	if (journal->j_revoke == journal->j_revoke_table[0])
-		journal->j_revoke = journal->j_revoke_table[1];
+	if (journal->j_revoke == &journal->j_revoke_table[0])
+		journal->j_revoke = &journal->j_revoke_table[1];
	else
-		journal->j_revoke = journal->j_revoke_table[0];
+		journal->j_revoke = &journal->j_revoke_table[0];

-	for (i = 0; i < journal->j_revoke->hash_size; i++)
-		INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]);
+	/* XXX: check rhashtable is empty? reinitialize it? */
}

/*
@@ -533,31 +439,36 @@  void jbd2_journal_write_revoke_records(transaction_t *transaction,
	journal_t *journal = transaction->t_journal;
	struct buffer_head *descriptor;
	struct jbd2_revoke_record_s *record;
-	struct jbd2_revoke_table_s *revoke;
-	struct list_head *hash_list;
-	int i, offset, count;
+	struct rhashtable_iter iter;
+	struct rhashtable *revoke;
+	int offset, count;

	descriptor = NULL;
	offset = 0;
	count = 0;

	/* select revoke table for committing transaction */
-	revoke = journal->j_revoke == journal->j_revoke_table[0] ?
-		journal->j_revoke_table[1] : journal->j_revoke_table[0];
-
-	for (i = 0; i < revoke->hash_size; i++) {
-		hash_list = &revoke->hash_table[i];
-
-		while (!list_empty(hash_list)) {
-			record = (struct jbd2_revoke_record_s *)
-				hash_list->next;
+	revoke = journal->j_revoke == &journal->j_revoke_table[0] ?
+		&journal->j_revoke_table[1] : &journal->j_revoke_table[0];
+
+	rhashtable_walk_enter(revoke, &iter);
+	rhashtable_walk_start(&iter);
+	while ((record = rhashtable_walk_next(&iter)) != NULL) {
+		if (IS_ERR(record))
+			continue;
+		if (rhashtable_remove_fast(revoke,
+					   &record->linkage,
+					   revoke_rhashtable_params) == 0) {
+			rhashtable_walk_stop(&iter);
			write_one_revoke_record(transaction, log_bufs,
						&descriptor, &offset, record);
+			rhashtable_walk_start(&iter);
			count++;
-			list_del(&record->hash);
			kmem_cache_free(jbd2_revoke_record_cache, record);
		}
	}
+	rhashtable_walk_stop(&iter);
+	rhashtable_walk_exit(&iter);
	if (descriptor)
		flush_descriptor(journal, descriptor, offset);
	jbd_debug(1, "Wrote %d revoke records\n", count);
@@ -725,19 +636,23 @@  int jbd2_journal_test_revoke(journal_t *journal,

void jbd2_journal_clear_revoke(journal_t *journal)
{
-	int i;
-	struct list_head *hash_list;
+	struct rhashtable_iter iter;
	struct jbd2_revoke_record_s *record;
-	struct jbd2_revoke_table_s *revoke;
+	struct rhashtable *revoke;

	revoke = journal->j_revoke;

-	for (i = 0; i < revoke->hash_size; i++) {
-		hash_list = &revoke->hash_table[i];
-		while (!list_empty(hash_list)) {
-			record = (struct jbd2_revoke_record_s*) hash_list->next;
-			list_del(&record->hash);
+        rhashtable_walk_enter(revoke, &iter);
+        rhashtable_walk_start(&iter);
+        while ((record = rhashtable_walk_next(&iter)) != NULL) {
+		if (IS_ERR(record))
+			continue;
+                if (rhashtable_remove_fast(revoke,
+                                           &record->linkage,
+                                           revoke_rhashtable_params) == 0) {
			kmem_cache_free(jbd2_revoke_record_cache, record);
-		}
-	}
+                }
+        }
+        rhashtable_walk_stop(&iter);
+        rhashtable_walk_exit(&iter);
}
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index fd933c45281a..dcde4329cdd1 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -29,6 +29,7 @@ 
#include <linux/bit_spinlock.h>
#include <linux/blkdev.h>
#include <crypto/hash.h>
+#include <linux/rhashtable.h>
#endif

#define journal_oom_retry 1
@@ -1135,12 +1136,12 @@  struct journal_s
	 * The revoke table - maintains the list of revoked blocks in the
	 * current transaction.
	 */
-	struct jbd2_revoke_table_s *j_revoke;
+	struct rhashtable	*j_revoke;

	/**
	 * @j_revoke_table: Alternate revoke tables for j_revoke.
	 */
-	struct jbd2_revoke_table_s *j_revoke_table[2];
+	struct rhashtable	j_revoke_table[2];

	/**
	 * @j_wbuf: Array of bhs for jbd2_journal_commit_transaction.
@@ -1625,8 +1626,7 @@  static inline void jbd2_free_inode(struct jbd2_inode *jinode)
}

/* Primary revoke support */
-#define JOURNAL_REVOKE_DEFAULT_HASH 256
-extern int	   jbd2_journal_init_revoke(journal_t *, int);
+extern int	   jbd2_journal_init_revoke(journal_t *);
extern void	   jbd2_journal_destroy_revoke_record_cache(void);
extern void	   jbd2_journal_destroy_revoke_table_cache(void);
extern int __init jbd2_journal_init_revoke_record_cache(void);