diff mbox

[RFC,-v2,7/9] ext4: don't use the block freed but not yet committed during buddy initialization

Message ID 1225733769-23734-7-git-send-email-aneesh.kumar@linux.vnet.ibm.com
State Superseded, archived
Headers show

Commit Message

Aneesh Kumar K.V Nov. 3, 2008, 5:36 p.m. UTC
When we generate the buddy cache (especially during resize) we need to make
sure we don't use the blocks freed but not yet committed. This makes
sure we have the right value of free blocks count in the group
info and also in the bitmap. This also ensures ordered mode
consistency.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext4/mballoc.c |   95 ++++++++++++++++++++++++++++++++++++++++-------------
 1 files changed, 72 insertions(+), 23 deletions(-)

Comments

Theodore Ts'o Nov. 4, 2008, 5:15 p.m. UTC | #1
On Mon, Nov 03, 2008 at 11:06:07PM +0530, Aneesh Kumar K.V wrote:
> +static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
> +					ext4_group_t group,
> +					struct ext4_free_data *entry)
> +{
	...
> +	if (n->rb_left) {
> +		new_entry = rb_entry(n->rb_left, struct ext4_free_data, node);
> +		ext4_mb_generate_from_freelist(sb, bitmap, group, new_entry);
> +	}
> +	if (n->rb_right) {
> +		new_entry = rb_entry(n->rb_right, struct ext4_free_data, node);
> +		ext4_mb_generate_from_freelist(sb, bitmap, group, new_entry);
> +	}

ext4_mb_generate_from_freelist() is recursively calling itself, which
could easily blow the stack if there are a large number of items on
the free list (remember, this can include data blocks if
!ext4_should_writeback_data()).

You should probably use rb_first and rb_next in a loop rather than a
recursive descent.  I also remain concerned that
ext4_mb_generate_from_freelist() could burn a large amount of CPU
in some cases, and as I said on the conference call, if there is a way
to avoid it, that would be a Good Thing.

						- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Aneesh Kumar K.V Nov. 5, 2008, 3:23 p.m. UTC | #2
On Tue, Nov 04, 2008 at 12:15:15PM -0500, Theodore Tso wrote:
> On Mon, Nov 03, 2008 at 11:06:07PM +0530, Aneesh Kumar K.V wrote:
> > +static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
> > +					ext4_group_t group,
> > +					struct ext4_free_data *entry)
> > +{
> 	...
> > +	if (n->rb_left) {
> > +		new_entry = rb_entry(n->rb_left, struct ext4_free_data, node);
> > +		ext4_mb_generate_from_freelist(sb, bitmap, group, new_entry);
> > +	}
> > +	if (n->rb_right) {
> > +		new_entry = rb_entry(n->rb_right, struct ext4_free_data, node);
> > +		ext4_mb_generate_from_freelist(sb, bitmap, group, new_entry);
> > +	}
> 
> ext4_mb_generate_from_freelist() is recursively calling itself, which
> could easily blow the stack if there are a large number of items on
> the free list (remember, this can include data blocks if
> !ext4_should_writeback_data()).
> 
> You should probably use rb_first and rb_next in a loop rather than a
> recursive descent. 

Will do this.

> I also remain concerned that
> ext4_mb_generate_from_freelist() could burn a large amount of CPU
> in some cases, and as I said on the conference call, if there is a way
> to avoid it, that would be a Good Thing.

We need ext4_mb_generate_from_freelist for multiple cases:

a) While generating the buddy information we need to make sure we don't
use the blocks released but not yet committed to disk. We may force a
buddy rebuild because we added a new group via resize. We need to do
a buddy rebuild irrespective of whether we use ext4_mb_free_blocks or
the EXT4_MB_GRP_NEED_INIT flag.

b) When we release an inode preallocation we look at the block bitmap
and mark the blocks found free in the bitmap using mb_free_blocks.
Now if we allocate some blocks and later free some of them we may
have called ext4_mb_free_blocks on them, which means we would have
marked the blocks free in the bitmap. Now on file close we release the
inode pa. We look at the block bitmap and if the block is free
in the bitmap we call mb_free_blocks. Also on committing the transaction we
call mb_free_blocks on them. To avoid the above, when we discard the
inode pa we need to look at a bitmap that has the blocks
freed-but-not-yet-committed marked as used.

-aneesh
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 34a365e..f6d9e30 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -335,6 +335,9 @@ 
 static struct kmem_cache *ext4_free_ext_cachep;
 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
 					ext4_group_t group);
+static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
+					ext4_group_t group,
+					struct ext4_free_data *entry);
 static int ext4_mb_init_per_dev_proc(struct super_block *sb);
 static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
 static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
@@ -858,7 +861,9 @@  static int ext4_mb_init_cache(struct page *page, char *incore)
 			/*
 			 * incore got set to the group block bitmap below
 			 */
+			ext4_lock_group(sb, group);
 			ext4_mb_generate_buddy(sb, data, incore, group);
+			ext4_unlock_group(sb, group);
 			incore = NULL;
 		} else {
 			/* this is block of bitmap */
@@ -872,6 +877,7 @@  static int ext4_mb_init_cache(struct page *page, char *incore)
 
 			/* mark all preallocated blks used in in-core bitmap */
 			ext4_mb_generate_from_pa(sb, data, group);
+			ext4_mb_generate_from_freelist(sb, data, group, NULL);
 			ext4_unlock_group(sb, group);
 
 			/* set incore so that the buddy information can be
@@ -3432,6 +3438,43 @@  ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
 }
 
 /*
+ * the function goes through all block freed in the group
+ * but not yet committed and marks them used in in-core bitmap.
+ * buddy must be generated from this bitmap
+ * Need to be called with ext4 group lock (ext4_lock_group)
+ */
+static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
+					ext4_group_t group,
+					struct ext4_free_data *entry)
+{
+	struct rb_node *n;
+	struct ext4_group_info *grp;
+	struct ext4_free_data *new_entry;
+	if (entry == NULL) {
+		grp = ext4_get_group_info(sb, group);
+		n = grp->bb_free_root.rb_node;
+		entry = rb_entry(n, struct ext4_free_data, node);
+
+	} else
+		n = &entry->node;
+
+	if (n == NULL)
+		return;
+	if (n->rb_left) {
+		new_entry = rb_entry(n->rb_left, struct ext4_free_data, node);
+		ext4_mb_generate_from_freelist(sb, bitmap, group, new_entry);
+	}
+	if (n->rb_right) {
+		new_entry = rb_entry(n->rb_right, struct ext4_free_data, node);
+		ext4_mb_generate_from_freelist(sb, bitmap, group, new_entry);
+	}
+	mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
+					bitmap, entry->start_blk,
+					entry->count);
+	return;
+}
+
+/*
  * the function goes through all preallocation in this group and marks them
  * used in in-core bitmap. buddy must be generated from this bitmap
  * Need to be called with ext4 group lock (ext4_lock_group)
@@ -4532,27 +4575,22 @@  static int can_merge(struct ext4_free_data *entry1,
 
 static noinline_for_stack int
 ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
-			  ext4_group_t group, ext4_grpblk_t block, int count)
+			struct ext4_free_data *new_entry)
 {
+	ext4_grpblk_t block;
+	struct ext4_free_data *entry;
 	struct ext4_group_info *db = e4b->bd_info;
 	struct super_block *sb = e4b->bd_sb;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct ext4_free_data *entry, *new_entry;
 	struct rb_node **n = &db->bb_free_root.rb_node, *node;
 	struct rb_node *parent = NULL, *new_node;
 
-
 	BUG_ON(e4b->bd_bitmap_page == NULL);
 	BUG_ON(e4b->bd_buddy_page == NULL);
 
-	new_entry  = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
-	new_entry->start_blk = block;
-	new_entry->group  = group;
-	new_entry->count = count;
-	new_entry->t_tid = handle->h_transaction->t_tid;
 	new_node = &new_entry->node;
+	block = new_entry->start_blk;
 
-	ext4_lock_group(sb, group);
 	if (!*n) {
 		/* first free block exent. We need to
 		   protect buddy cache from being freed,
@@ -4570,7 +4608,6 @@  ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 		else if (block >= (entry->start_blk + entry->count))
 			n = &(*n)->rb_right;
 		else {
-			ext4_unlock_group(sb, group);
 			ext4_error(sb, __func__,
 			    "Double free of blocks %d (%d %d)\n",
 			    block, entry->start_blk, entry->count);
@@ -4612,7 +4649,6 @@  ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 	spin_lock(&sbi->s_md_lock);
 	list_add(&new_entry->list, &handle->h_transaction->t_private_list);
 	spin_unlock(&sbi->s_md_lock);
-	ext4_unlock_group(sb, group);
 	return 0;
 }
 
@@ -4717,15 +4753,6 @@  void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
 			BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
 	}
 #endif
-	mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
-			bit, count);
-
-	/* We dirtied the bitmap block */
-	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
-	err = ext4_journal_dirty_metadata(handle, bitmap_bh);
-	if (err)
-		goto error_return;
-
 	if (ac) {
 		ac->ac_b_ex.fe_group = block_group;
 		ac->ac_b_ex.fe_start = bit;
@@ -4739,11 +4766,29 @@  void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
 		goto error_return;
 	}
 	if (metadata) {
-		/* blocks being freed are metadata. these blocks shouldn't
-		 * be used until this transaction is committed */
-		ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
+		struct ext4_free_data *new_entry;
+		/*
+		 * blocks being freed are metadata. these blocks shouldn't
+		 * be used until this transaction is committed
+		 */
+		new_entry  = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+		new_entry->start_blk = bit;
+		new_entry->group  = block_group;
+		new_entry->count = count;
+		new_entry->t_tid = handle->h_transaction->t_tid;
+		ext4_lock_group(sb, block_group);
+		mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
+				bit, count);
+		ext4_mb_free_metadata(handle, &e4b, new_entry);
+		ext4_unlock_group(sb, block_group);
 	} else {
 		ext4_lock_group(sb, block_group);
+		/* need to update group_info->bb_free and bitmap
+		 * with group lock held. generate_buddy look at
+		 * them with group lock_held
+		 */
+		mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
+				bit, count);
 		mb_free_blocks(inode, &e4b, bit, count);
 		ext4_mb_return_to_preallocation(inode, &e4b, block, count);
 		ext4_unlock_group(sb, block_group);
@@ -4766,6 +4811,10 @@  void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
 
 	*freed += count;
 
+	/* We dirtied the bitmap block */
+	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+	err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+
 	/* And the group descriptor block */
 	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
 	ret = ext4_journal_dirty_metadata(handle, gd_bh);