Patchwork [2/2] ext4: Move quota initialization out of inode allocation transaction

login
register
mail settings
Submitter Theodore Ts'o
Date April 18, 2013, 7:07 p.m.
Message ID <20130418190738.GB5588@thunk.org>
Download mbox | patch
Permalink /patch/237720/
State Superseded
Headers show

Comments

Theodore Ts'o - April 18, 2013, 7:07 p.m.
On Tue, Apr 09, 2013 at 12:42:37PM -0400, Theodore Ts'o wrote:
> 
> Thanks, added to the dev branch for testing.
> 

This patch was causing a reliable failure for xfstests generic/083
using bigalloc.  I bisected down this patch, and then localized the
failure down to the insert_inode_locked() call in __ext4_new_inode()
failing (which means that there is an old inode in the inode hash
lists with the same inode number)

The error seems to be caused by the error handling code paths.  I
believe the problem was caused by clear_nlink() and unlock_new_inode()
getting called in some error cleanup paths when the inode that hadn't
yet been inserted into inode hash lists.

Here is a revamped patch which has a cleaned up set of error paths.
The only change is in the cleanup paths, and this causes the
generic/083 test for bigalloc to pass where it was previously failing.

	    	     	      	      	    - Ted

commit 623df013725903824ac341a32e9c11c72752d8eb
Author: Jan Kara <jack@suse.cz>
Date:   Thu Apr 18 15:03:28 2013 -0400

    ext4: move quota initialization out of inode allocation transaction
    
    Inode allocation transaction is pretty heavy (246 credits with quotas
    and extents before previous patch, still around 200 after it).  This is
    mostly due to credits required for allocation of quota structures
    (credits there are heavily overestimated but it's difficult to make
    better estimates if we don't want to wire non-trivial assumptions about
    quota format into filesystem).
    
    So move quota initialization out of allocation transaction. That way
    transaction for quota structure allocation will be started only if we
    need to look up quota structure on disk (rare) and furthermore it will
    be started for each quota type separately, not for all of them at once.
    This reduces maximum transaction size to 34 is most cases and to 73 in
    the worst case.
    
    [ Modified by tytso to clean up the cleanup paths for error handling.
      Also use a separate call to ext4_std_error() for each failure so it
      is easier for someone who is debugging a problem in this function to
      determine which function call failed. ]
    
    Signed-off-by: Jan Kara <jack@suse.cz>
    Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara - April 18, 2013, 9:46 p.m.
On Thu 18-04-13 15:07:38, Ted Tso wrote:
> On Tue, Apr 09, 2013 at 12:42:37PM -0400, Theodore Ts'o wrote:
> > 
> > Thanks, added to the dev branch for testing.
> > 
> 
> This patch was causing a reliable failure for xfstests generic/083
> using bigalloc.  I bisected down this patch, and then localized the
> failure down to the insert_inode_locked() call in __ext4_new_inode()
> failing (which means that there is an old inode in the inode hash
> lists with the same inode number)
> 
> The error seems to be caused by the error handling code paths.  I
> believe the problem was caused by clear_nlink() and unlock_new_inode()
> getting called in some error cleanup paths when the inode that hadn't
> yet been inserted into inode hash lists.
> 
> Here is a revamped patch which has a cleaned up set of error paths.
> The only change is in the cleanup paths, and this causes the
> generic/083 test for bigalloc to pass where it was previously failing.
  Thanks for catching this. But there's a bug in the new error handling as
well:

> @@ -733,13 +750,17 @@ repeat_in_this_group:
>  							 handle_type, nblocks);
>  			if (IS_ERR(handle)) {
>  				err = PTR_ERR(handle);
> -				goto fail;
> +				ext4_std_error(sb, err);
> +				goto out;
>  			}
> +
  Extra empty line...

> @@ -951,24 +972,17 @@ got:
>  
>  	ext4_debug("allocating inode %lu\n", inode->i_ino);
>  	trace_ext4_allocate_inode(inode, dir, mode);
> -	goto really_out;
> -fail:
> -	ext4_std_error(sb, err);
> -out:
> -	iput(inode);
> -	ret = ERR_PTR(err);
> -really_out:
>  	brelse(inode_bitmap_bh);
>  	return ret;
>  
>  fail_free_drop:
>  	dquot_free_inode(inode);
> -
>  fail_drop:
> -	dquot_drop(inode);
>  	inode->i_flags |= S_NOQUOTA;
>  	clear_nlink(inode);
>  	unlock_new_inode(inode);
> +out:
> +	dquot_drop(inode);
>  	iput(inode);
>  	brelse(inode_bitmap_bh);
>  	return ERR_PTR(err);
  You need to move
  	inode->i_flags |= S_NOQUOTA;
  along with dquot_drop() call as dquot_drop() will do nothing on an inode
marked with S_NOQUOTA so we leak dquot references...

									Honza

Patch

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 4c358f7..9081fef 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -666,6 +666,23 @@  struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 	ei = EXT4_I(inode);
 	sbi = EXT4_SB(sb);
 
+	/*
+	 * Initalize owners and quota early so that we don't have to account
+	 * for quota initialization worst case in standard inode creating
+	 * transaction
+	 */
+	if (owner) {
+		inode->i_mode = mode;
+		i_uid_write(inode, owner[0]);
+		i_gid_write(inode, owner[1]);
+	} else if (test_opt(sb, GRPID)) {
+		inode->i_mode = mode;
+		inode->i_uid = current_fsuid();
+		inode->i_gid = dir->i_gid;
+	} else
+		inode_init_owner(inode, dir, mode);
+	dquot_initialize(inode);
+
 	if (!goal)
 		goal = sbi->s_inode_goal;
 
@@ -697,7 +714,7 @@  got_group:
 
 		gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
 		if (!gdp)
-			goto fail;
+			goto out;
 
 		/*
 		 * Check free inodes count before loading bitmap.
@@ -711,7 +728,7 @@  got_group:
 		brelse(inode_bitmap_bh);
 		inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
 		if (!inode_bitmap_bh)
-			goto fail;
+			goto out;
 
 repeat_in_this_group:
 		ino = ext4_find_next_zero_bit((unsigned long *)
@@ -733,13 +750,17 @@  repeat_in_this_group:
 							 handle_type, nblocks);
 			if (IS_ERR(handle)) {
 				err = PTR_ERR(handle);
-				goto fail;
+				ext4_std_error(sb, err);
+				goto out;
 			}
+
 		}
 		BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
 		err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
-		if (err)
-			goto fail;
+		if (err) {
+			ext4_std_error(sb, err);
+			goto out;
+		}
 		ext4_lock_group(sb, group);
 		ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
 		ext4_unlock_group(sb, group);
@@ -755,8 +776,10 @@  repeat_in_this_group:
 got:
 	BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
 	err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
-	if (err)
-		goto fail;
+	if (err) {
+		ext4_std_error(sb, err);
+		goto out;
+	}
 
 	/* We may have to initialize the block bitmap if it isn't already */
 	if (ext4_has_group_desc_csum(sb) &&
@@ -768,7 +791,8 @@  got:
 		err = ext4_journal_get_write_access(handle, block_bitmap_bh);
 		if (err) {
 			brelse(block_bitmap_bh);
-			goto fail;
+			ext4_std_error(sb, err);
+			goto out;
 		}
 
 		BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
@@ -787,14 +811,18 @@  got:
 		ext4_unlock_group(sb, group);
 		brelse(block_bitmap_bh);
 
-		if (err)
-			goto fail;
+		if (err) {
+			ext4_std_error(sb, err);
+			goto out;
+		}
 	}
 
 	BUFFER_TRACE(group_desc_bh, "get_write_access");
 	err = ext4_journal_get_write_access(handle, group_desc_bh);
-	if (err)
-		goto fail;
+	if (err) {
+		ext4_std_error(sb, err);
+		goto out;
+	}
 
 	/* Update the relevant bg descriptor fields */
 	if (ext4_has_group_desc_csum(sb)) {
@@ -840,8 +868,10 @@  got:
 
 	BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
 	err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
-	if (err)
-		goto fail;
+	if (err) {
+		ext4_std_error(sb, err);
+		goto out;
+	}
 
 	percpu_counter_dec(&sbi->s_freeinodes_counter);
 	if (S_ISDIR(mode))
@@ -851,16 +881,6 @@  got:
 		flex_group = ext4_flex_group(sbi, group);
 		atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
 	}
-	if (owner) {
-		inode->i_mode = mode;
-		i_uid_write(inode, owner[0]);
-		i_gid_write(inode, owner[1]);
-	} else if (test_opt(sb, GRPID)) {
-		inode->i_mode = mode;
-		inode->i_uid = current_fsuid();
-		inode->i_gid = dir->i_gid;
-	} else
-		inode_init_owner(inode, dir, mode);
 
 	inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
 	/* This is the optimal IO size (for stat), not the fs block size */
@@ -889,7 +909,9 @@  got:
 		 * twice.
 		 */
 		err = -EIO;
-		goto fail;
+		ext4_error(sb, "failed to insert inode %lu: doubly allocated?",
+			   inode->i_ino);
+		goto out;
 	}
 	spin_lock(&sbi->s_next_gen_lock);
 	inode->i_generation = sbi->s_next_generation++;
@@ -917,7 +939,6 @@  got:
 		ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
 
 	ret = inode;
-	dquot_initialize(inode);
 	err = dquot_alloc_inode(inode);
 	if (err)
 		goto fail_drop;
@@ -951,24 +972,17 @@  got:
 
 	ext4_debug("allocating inode %lu\n", inode->i_ino);
 	trace_ext4_allocate_inode(inode, dir, mode);
-	goto really_out;
-fail:
-	ext4_std_error(sb, err);
-out:
-	iput(inode);
-	ret = ERR_PTR(err);
-really_out:
 	brelse(inode_bitmap_bh);
 	return ret;
 
 fail_free_drop:
 	dquot_free_inode(inode);
-
 fail_drop:
-	dquot_drop(inode);
 	inode->i_flags |= S_NOQUOTA;
 	clear_nlink(inode);
 	unlock_new_inode(inode);
+out:
+	dquot_drop(inode);
 	iput(inode);
 	brelse(inode_bitmap_bh);
 	return ERR_PTR(err);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 45a5ca8..955c907 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2252,8 +2252,7 @@  static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 	dquot_initialize(dir);
 
 	credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
-		   EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-		   EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+		   EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
 retry:
 	inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
 					    NULL, EXT4_HT_DIR, credits);
@@ -2287,8 +2286,7 @@  static int ext4_mknod(struct inode *dir, struct dentry *dentry,
 	dquot_initialize(dir);
 
 	credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
-		   EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-		   EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+		   EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
 retry:
 	inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
 					    NULL, EXT4_HT_DIR, credits);
@@ -2397,8 +2395,7 @@  static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	dquot_initialize(dir);
 
 	credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
-		   EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-		   EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+		   EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
 retry:
 	inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode,
 					    &dentry->d_name,
@@ -2827,8 +2824,7 @@  static int ext4_symlink(struct inode *dir,
 		 * quota blocks, sb is already counted in previous macros).
 		 */
 		credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
-			  EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-			  EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+			  EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3;
 	}
 retry:
 	inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO,