Patchwork [2/2] fs/ext4/namei.c: reducing contention on s_orphan_lock mmutex

login
register
mail settings
Submitter T Makphaibulchoke
Date Oct. 2, 2013, 3:36 p.m.
Message ID <1380728219-60784-2-git-send-email-tmac@hp.com>
Download mbox | patch
Permalink /patch/279796/
State New
Headers show

Comments

T Makphaibulchoke - Oct. 2, 2013, 3:36 p.m.
Instead of using a single per super block mutex, s_orphan_lock, to serialize
all orphan list updates, a separate mutex and spinlock are used to
protect the on disk and in memory orphan lists respecvitely.

At the same time, a per inode mutex is used to serialize orphan list
updates of a single inode.

Signed-off-by: T. Makphaibulchoke <tmac@hp.com>
---
 fs/ext4/namei.c | 139 ++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 100 insertions(+), 39 deletions(-)
Zheng Liu - Oct. 3, 2013, 2:05 a.m.
Hello,

On Wed, Oct 02, 2013 at 09:36:59AM -0600, T Makphaibulchoke wrote:
> Instead of using a single per super block mutex, s_orphan_lock, to serialize
> all orphan list updates, a separate mutex and spinlock are used to
> protect the on disk and in memory orphan lists respecvitely.
> 
> At the same time, a per inode mutex is used to serialize orphan list
> updates of a single inode.
> 
> Signed-off-by: T. Makphaibulchoke <tmac@hp.com>
> ---
>  fs/ext4/namei.c | 139 ++++++++++++++++++++++++++++++++++++++++----------------
>  1 file changed, 100 insertions(+), 39 deletions(-)
> 
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index 35f55a0..d7d3d0f 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -2554,12 +2554,24 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
>  	struct super_block *sb = inode->i_sb;
>  	struct ext4_iloc iloc;
>  	int err = 0, rc;
> +	struct ext4_inode_info *ext4_inode = EXT4_I(inode);
> +	struct ext4_sb_info *ext4_sb = EXT4_SB(sb);
> +	struct mutex *inode_orphan_mutex;
> +	spinlock_t *orphan_lock;
> +	struct mutex *ondisk_orphan_mutex;
> +	struct list_head *prev;
> +	unsigned int next_inum;
> +	struct inode *next_inode;
>  
> -	if (!EXT4_SB(sb)->s_journal)
> +	if (ext4_sb->s_journal)
        ^^^^^^^^
        typo: !ext4_sb->s_journal
I am not sure whether or not this will impact the result because when
journal is enabled the inode will not be added into orphan list.

Regards,
                                                - Zheng

>  		return 0;
>  
> -	mutex_lock(&EXT4_SB(sb)->s_orphan_lock);
> -	if (!list_empty(&EXT4_I(inode)->i_orphan))
> +	inode_orphan_mutex = &ext4_inode->i_orphan_mutex;
> +	orphan_lock = &ext4_sb->s_orphan_lock;
> +	ondisk_orphan_mutex = &ext4_sb->s_ondisk_orphan_lock;
> +	mutex_lock(inode_orphan_mutex);
> +	prev = ext4_inode->i_orphan.prev;
> +	if (prev != &ext4_inode->i_orphan)
>  		goto out_unlock;
>  
>  	/*
> @@ -2579,17 +2591,28 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
>  	err = ext4_reserve_inode_write(handle, inode, &iloc);
>  	if (err)
>  		goto out_unlock;
> +	mutex_lock(ondisk_orphan_mutex);
>  	/*
>  	 * Due to previous errors inode may be already a part of on-disk
>  	 * orphan list. If so skip on-disk list modification.
>  	 */
>  	if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <=
> -		(le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)))
> +		(le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) {
> +			mutex_unlock(ondisk_orphan_mutex);
>  			goto mem_insert;
> +	}
>  
>  	/* Insert this inode at the head of the on-disk orphan list... */
> -	NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
> -	EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
> +	next_inum = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
> +	NEXT_ORPHAN(inode) = next_inum;
> +	next_inode = ilookup(sb, next_inum);
> +	if (next_inode)
> +		EXT4_I(next_inode)->i_prev_orphan = inode->i_ino;
> +	ext4_sb->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
> +	ext4_inode->i_prev_orphan = 0;
> +	mutex_unlock(ondisk_orphan_mutex);
> +	if (next_inode)
> +		iput(next_inode);
>  	err = ext4_handle_dirty_super(handle, sb);
>  	rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
>  	if (!err)
> @@ -2604,14 +2627,17 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
>  	 * This is safe: on error we're going to ignore the orphan list
>  	 * anyway on the next recovery. */
>  mem_insert:
> -	if (!err)
> -		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
> +	if (!err) {
> +		spin_lock(orphan_lock);
> +		list_add(&ext4_inode->i_orphan, &ext4_sb->s_orphan);
> +		spin_unlock(orphan_lock);
> +	}
>  
>  	jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
>  	jbd_debug(4, "orphan inode %lu will point to %d\n",
>  			inode->i_ino, NEXT_ORPHAN(inode));
>  out_unlock:
> -	mutex_unlock(&EXT4_SB(sb)->s_orphan_lock);
> +	mutex_unlock(inode_orphan_mutex);
>  	ext4_std_error(inode->i_sb, err);
>  	return err;
>  }
> @@ -2624,71 +2650,106 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
>  {
>  	struct list_head *prev;
>  	struct ext4_inode_info *ei = EXT4_I(inode);
> -	struct ext4_sb_info *sbi;
> +	struct inode *i_next = NULL;
> +	struct inode *i_prev = NULL;
> +	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
>  	__u32 ino_next;
> +	unsigned int ino_prev;
>  	struct ext4_iloc iloc;
>  	int err = 0;
> +	struct mutex *inode_orphan_mutex;
> +	spinlock_t *orphan_lock;
> +	struct mutex *ondisk_orphan_mutex;
>  
> -	if ((!EXT4_SB(inode->i_sb)->s_journal) &&
> -	    !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS))
> +	if ((!sbi->s_journal) &&
> +	    !(sbi->s_mount_state & EXT4_ORPHAN_FS))
>  		return 0;
>  
> -	mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
> -	if (list_empty(&ei->i_orphan))
> -		goto out;
> -
> -	ino_next = NEXT_ORPHAN(inode);
> +	inode_orphan_mutex = &ei->i_orphan_mutex;
> +	orphan_lock = &sbi->s_orphan_lock;
> +	ondisk_orphan_mutex = &sbi->s_ondisk_orphan_lock;
> +	mutex_lock(inode_orphan_mutex);
>  	prev = ei->i_orphan.prev;
> -	sbi = EXT4_SB(inode->i_sb);
> +	if (prev == &ei->i_orphan) {
> +		mutex_unlock(inode_orphan_mutex);
> +		return err;
> +	}
>  
>  	jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
>  
> +	spin_lock(orphan_lock);
>  	list_del_init(&ei->i_orphan);
> +	spin_unlock(orphan_lock);
>  
>  	/* If we're on an error path, we may not have a valid
>  	 * transaction handle with which to update the orphan list on
>  	 * disk, but we still need to remove the inode from the linked
>  	 * list in memory. */
> -	if (!handle)
> -		goto out;
> +	if (!handle) {
> +		mutex_unlock(inode_orphan_mutex);
> +		return err;
> +	}
>  
>  	err = ext4_reserve_inode_write(handle, inode, &iloc);
> -	if (err)
> +	if (err) {
> +		mutex_unlock(inode_orphan_mutex);
>  		goto out_err;
> +	}
>  
> -	if (prev == &sbi->s_orphan) {
> +	mutex_lock(ondisk_orphan_mutex);
> +	ino_next = NEXT_ORPHAN(inode);
> +	if (ino_next > 0) {
> +		i_next = ilookup(inode->i_sb, ino_next);
> +		assert(i_next);
> +	}
> +	ino_prev = ei->i_prev_orphan;
> +	if (!ino_prev) {
>  		jbd_debug(4, "superblock will point to %u\n", ino_next);
>  		BUFFER_TRACE(sbi->s_sbh, "get_write_access");
>  		err = ext4_journal_get_write_access(handle, sbi->s_sbh);
> -		if (err)
> -			goto out_brelse;
> -		sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
> -		err = ext4_handle_dirty_super(handle, inode->i_sb);
> +		if (!err) {
> +			sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
> +			if (i_next)
> +				EXT4_I(i_next)->i_prev_orphan = 0;
> +		}
> +		mutex_unlock(ondisk_orphan_mutex);
> +		if (!err)
> +			err = ext4_handle_dirty_super(handle, inode->i_sb);
>  	} else {
>  		struct ext4_iloc iloc2;
> -		struct inode *i_prev =
> -			&list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
> -
> -		jbd_debug(4, "orphan inode %lu will point to %u\n",
> -			  i_prev->i_ino, ino_next);
> -		err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
> -		if (err)
> -			goto out_brelse;
> -		NEXT_ORPHAN(i_prev) = ino_next;
> -		err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
> +		i_prev = ilookup(inode->i_sb, ino_prev);
> +
> +		assert(i_prev);
> +		if (i_prev) {
> +			jbd_debug(4, "orphan inode %lu will point to %u\n",
> +				i_prev->i_ino, ino_next);
> +			err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
> +		} else
> +			err = -ENOENT;
> +		if (!err) {
> +			NEXT_ORPHAN(i_prev) = ino_next;
> +			if (i_next)
> +				EXT4_I(i_next)->i_prev_orphan = ino_prev;
> +		}
> +		mutex_unlock(ondisk_orphan_mutex);
> +		if (i_prev && !err)
> +			err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
>  	}
>  	if (err)
>  		goto out_brelse;
>  	NEXT_ORPHAN(inode) = 0;
> +	mutex_unlock(inode_orphan_mutex);
>  	err = ext4_mark_iloc_dirty(handle, inode, &iloc);
> -
>  out_err:
> +	if (i_next)
> +		iput(i_next);
> +	if (i_prev)
> +		iput(i_prev);
>  	ext4_std_error(inode->i_sb, err);
> -out:
> -	mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
>  	return err;
>  
>  out_brelse:
> +	mutex_unlock(inode_orphan_mutex);
>  	brelse(iloc.bh);
>  	goto out_err;
>  }
> -- 
> 1.7.11.3
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Thavatchai Makphaibulchoke - Oct. 3, 2013, 8:31 a.m.
On 10/02/2013 08:05 PM, Zheng Liu wrote:
> Hello,
> 

>> -	if (!EXT4_SB(sb)->s_journal)
>> +	if (ext4_sb->s_journal)
>         ^^^^^^^^
>         typo: !ext4_sb->s_journal
> I am not sure whether or not this will impact the result because when
> journal is enabled the inode will not be added into orphan list.
> 
> Regards,
>                                                 - Zheng
> 
Thanks Zheng for pointing out the problem.  Will fix the problem and rerun the test.

Thanks,
Mak.


--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Thavatchai Makphaibulchoke - Oct. 3, 2013, 11:08 p.m.
On 10/03/2013 06:41 PM, Andreas Dilger wrote:

>> +	struct inode *next_inode;
> 
> Stack space in the kernel is not so abundant that all (or any?) of these
> should get their own local variable.
> 
>>
>> -	if (!EXT4_SB(sb)->s_journal)
> 
> Same here.
> 
> 
> Cheers, Andreas

Thanks Andreas for the comments.  On larger machines with processors with lots of register, with the compiler optimization I don't think it matters whether stack variables or repeated common expressions are used.  On smaller machines with processors with limited number of registers, this will be a problem.  I'll fix these on my rework.

Thanks,
Mak.


--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andreas Dilger - Oct. 4, 2013, 12:41 a.m.
On 2013-10-02, at 9:36 AM, T Makphaibulchoke wrote:

> Instead of using a single per super block mutex, s_orphan_lock, to serialize
> all orphan list updates, a separate mutex and spinlock are used to
> protect the on disk and in memory orphan lists respecvitely.
> 
> At the same time, a per inode mutex is used to serialize orphan list
> updates of a single inode.
> 
> Signed-off-by: T. Makphaibulchoke <tmac@hp.com>
> ---
> fs/ext4/namei.c | 139 ++++++++++++++++++++++++++++++++++++++++----------------
> 1 file changed, 100 insertions(+), 39 deletions(-)
> 
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index 35f55a0..d7d3d0f 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -2554,12 +2554,24 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
> 	struct super_block *sb = inode->i_sb;
> 	struct ext4_iloc iloc;
> 	int err = 0, rc;
> +	struct ext4_inode_info *ext4_inode = EXT4_I(inode);
> +	struct ext4_sb_info *ext4_sb = EXT4_SB(sb);
> +	struct mutex *inode_orphan_mutex;
> +	spinlock_t *orphan_lock;
> +	struct mutex *ondisk_orphan_mutex;
> +	struct list_head *prev;
> +	unsigned int next_inum;
> +	struct inode *next_inode;

Stack space in the kernel is not so abundant that all (or any?) of these
should get their own local variable.

> 
> -	if (!EXT4_SB(sb)->s_journal)
> +	if (ext4_sb->s_journal)
> 		return 0;
> 
> -	mutex_lock(&EXT4_SB(sb)->s_orphan_lock);
> -	if (!list_empty(&EXT4_I(inode)->i_orphan))
> +	inode_orphan_mutex = &ext4_inode->i_orphan_mutex;
> +	orphan_lock = &ext4_sb->s_orphan_lock;
> +	ondisk_orphan_mutex = &ext4_sb->s_ondisk_orphan_lock;
> +	mutex_lock(inode_orphan_mutex);
> +	prev = ext4_inode->i_orphan.prev;
> +	if (prev != &ext4_inode->i_orphan)
> 		goto out_unlock;
> 
> 	/*
> @@ -2579,17 +2591,28 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
> 	err = ext4_reserve_inode_write(handle, inode, &iloc);
> 	if (err)
> 		goto out_unlock;
> +	mutex_lock(ondisk_orphan_mutex);
> 	/*
> 	 * Due to previous errors inode may be already a part of on-disk
> 	 * orphan list. If so skip on-disk list modification.
> 	 */
> 	if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <=
> -		(le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)))
> +		(le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) {
> +			mutex_unlock(ondisk_orphan_mutex);
> 			goto mem_insert;
> +	}
> 
> 	/* Insert this inode at the head of the on-disk orphan list... */
> -	NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
> -	EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
> +	next_inum = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
> +	NEXT_ORPHAN(inode) = next_inum;
> +	next_inode = ilookup(sb, next_inum);
> +	if (next_inode)
> +		EXT4_I(next_inode)->i_prev_orphan = inode->i_ino;
> +	ext4_sb->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
> +	ext4_inode->i_prev_orphan = 0;
> +	mutex_unlock(ondisk_orphan_mutex);
> +	if (next_inode)
> +		iput(next_inode);
> 	err = ext4_handle_dirty_super(handle, sb);
> 	rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
> 	if (!err)
> @@ -2604,14 +2627,17 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
> 	 * This is safe: on error we're going to ignore the orphan list
> 	 * anyway on the next recovery. */
> mem_insert:
> -	if (!err)
> -		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
> +	if (!err) {
> +		spin_lock(orphan_lock);
> +		list_add(&ext4_inode->i_orphan, &ext4_sb->s_orphan);
> +		spin_unlock(orphan_lock);
> +	}
> 
> 	jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
> 	jbd_debug(4, "orphan inode %lu will point to %d\n",
> 			inode->i_ino, NEXT_ORPHAN(inode));
> out_unlock:
> -	mutex_unlock(&EXT4_SB(sb)->s_orphan_lock);
> +	mutex_unlock(inode_orphan_mutex);
> 	ext4_std_error(inode->i_sb, err);
> 	return err;
> }
> @@ -2624,71 +2650,106 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
> {
> 	struct list_head *prev;
> 	struct ext4_inode_info *ei = EXT4_I(inode);
> -	struct ext4_sb_info *sbi;
> +	struct inode *i_next = NULL;
> +	struct inode *i_prev = NULL;
> +	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> 	__u32 ino_next;
> +	unsigned int ino_prev;
> 	struct ext4_iloc iloc;
> 	int err = 0;
> +	struct mutex *inode_orphan_mutex;
> +	spinlock_t *orphan_lock;
> +	struct mutex *ondisk_orphan_mutex;

Same here.

> -	if ((!EXT4_SB(inode->i_sb)->s_journal) &&
> -	    !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS))
> +	if ((!sbi->s_journal) &&
> +	    !(sbi->s_mount_state & EXT4_ORPHAN_FS))
> 		return 0;
> 
> -	mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
> -	if (list_empty(&ei->i_orphan))
> -		goto out;
> -
> -	ino_next = NEXT_ORPHAN(inode);
> +	inode_orphan_mutex = &ei->i_orphan_mutex;
> +	orphan_lock = &sbi->s_orphan_lock;
> +	ondisk_orphan_mutex = &sbi->s_ondisk_orphan_lock;
> +	mutex_lock(inode_orphan_mutex);
> 	prev = ei->i_orphan.prev;
> -	sbi = EXT4_SB(inode->i_sb);
> +	if (prev == &ei->i_orphan) {
> +		mutex_unlock(inode_orphan_mutex);
> +		return err;
> +	}
> 
> 	jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
> 
> +	spin_lock(orphan_lock);
> 	list_del_init(&ei->i_orphan);
> +	spin_unlock(orphan_lock);
> 
> 	/* If we're on an error path, we may not have a valid
> 	 * transaction handle with which to update the orphan list on
> 	 * disk, but we still need to remove the inode from the linked
> 	 * list in memory. */
> -	if (!handle)
> -		goto out;
> +	if (!handle) {
> +		mutex_unlock(inode_orphan_mutex);
> +		return err;
> +	}
> 
> 	err = ext4_reserve_inode_write(handle, inode, &iloc);
> -	if (err)
> +	if (err) {
> +		mutex_unlock(inode_orphan_mutex);
> 		goto out_err;
> +	}
> 
> -	if (prev == &sbi->s_orphan) {
> +	mutex_lock(ondisk_orphan_mutex);
> +	ino_next = NEXT_ORPHAN(inode);
> +	if (ino_next > 0) {
> +		i_next = ilookup(inode->i_sb, ino_next);
> +		assert(i_next);
> +	}
> +	ino_prev = ei->i_prev_orphan;
> +	if (!ino_prev) {
> 		jbd_debug(4, "superblock will point to %u\n", ino_next);
> 		BUFFER_TRACE(sbi->s_sbh, "get_write_access");
> 		err = ext4_journal_get_write_access(handle, sbi->s_sbh);
> -		if (err)
> -			goto out_brelse;
> -		sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
> -		err = ext4_handle_dirty_super(handle, inode->i_sb);
> +		if (!err) {
> +			sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
> +			if (i_next)
> +				EXT4_I(i_next)->i_prev_orphan = 0;
> +		}
> +		mutex_unlock(ondisk_orphan_mutex);
> +		if (!err)
> +			err = ext4_handle_dirty_super(handle, inode->i_sb);
> 	} else {
> 		struct ext4_iloc iloc2;
> -		struct inode *i_prev =
> -			&list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
> -
> -		jbd_debug(4, "orphan inode %lu will point to %u\n",
> -			  i_prev->i_ino, ino_next);
> -		err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
> -		if (err)
> -			goto out_brelse;
> -		NEXT_ORPHAN(i_prev) = ino_next;
> -		err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
> +		i_prev = ilookup(inode->i_sb, ino_prev);
> +
> +		assert(i_prev);
> +		if (i_prev) {
> +			jbd_debug(4, "orphan inode %lu will point to %u\n",
> +				i_prev->i_ino, ino_next);
> +			err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
> +		} else
> +			err = -ENOENT;
> +		if (!err) {
> +			NEXT_ORPHAN(i_prev) = ino_next;
> +			if (i_next)
> +				EXT4_I(i_next)->i_prev_orphan = ino_prev;
> +		}
> +		mutex_unlock(ondisk_orphan_mutex);
> +		if (i_prev && !err)
> +			err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
> 	}
> 	if (err)
> 		goto out_brelse;
> 	NEXT_ORPHAN(inode) = 0;
> +	mutex_unlock(inode_orphan_mutex);
> 	err = ext4_mark_iloc_dirty(handle, inode, &iloc);
> -
> out_err:
> +	if (i_next)
> +		iput(i_next);
> +	if (i_prev)
> +		iput(i_prev);
> 	ext4_std_error(inode->i_sb, err);
> -out:
> -	mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
> 	return err;
> 
> out_brelse:
> +	mutex_unlock(inode_orphan_mutex);
> 	brelse(iloc.bh);
> 	goto out_err;
> }
> -- 
> 1.7.11.3
> 


Cheers, Andreas





--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 35f55a0..d7d3d0f 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2554,12 +2554,24 @@  int ext4_orphan_add(handle_t *handle, struct inode *inode)
 	struct super_block *sb = inode->i_sb;
 	struct ext4_iloc iloc;
 	int err = 0, rc;
+	struct ext4_inode_info *ext4_inode = EXT4_I(inode);
+	struct ext4_sb_info *ext4_sb = EXT4_SB(sb);
+	struct mutex *inode_orphan_mutex;
+	spinlock_t *orphan_lock;
+	struct mutex *ondisk_orphan_mutex;
+	struct list_head *prev;
+	unsigned int next_inum;
+	struct inode *next_inode;
 
-	if (!EXT4_SB(sb)->s_journal)
+	if (ext4_sb->s_journal)
 		return 0;
 
-	mutex_lock(&EXT4_SB(sb)->s_orphan_lock);
-	if (!list_empty(&EXT4_I(inode)->i_orphan))
+	inode_orphan_mutex = &ext4_inode->i_orphan_mutex;
+	orphan_lock = &ext4_sb->s_orphan_lock;
+	ondisk_orphan_mutex = &ext4_sb->s_ondisk_orphan_lock;
+	mutex_lock(inode_orphan_mutex);
+	prev = ext4_inode->i_orphan.prev;
+	if (prev != &ext4_inode->i_orphan)
 		goto out_unlock;
 
 	/*
@@ -2579,17 +2591,28 @@  int ext4_orphan_add(handle_t *handle, struct inode *inode)
 	err = ext4_reserve_inode_write(handle, inode, &iloc);
 	if (err)
 		goto out_unlock;
+	mutex_lock(ondisk_orphan_mutex);
 	/*
 	 * Due to previous errors inode may be already a part of on-disk
 	 * orphan list. If so skip on-disk list modification.
 	 */
 	if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <=
-		(le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)))
+		(le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) {
+			mutex_unlock(ondisk_orphan_mutex);
 			goto mem_insert;
+	}
 
 	/* Insert this inode at the head of the on-disk orphan list... */
-	NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
-	EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
+	next_inum = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
+	NEXT_ORPHAN(inode) = next_inum;
+	next_inode = ilookup(sb, next_inum);
+	if (next_inode)
+		EXT4_I(next_inode)->i_prev_orphan = inode->i_ino;
+	ext4_sb->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
+	ext4_inode->i_prev_orphan = 0;
+	mutex_unlock(ondisk_orphan_mutex);
+	if (next_inode)
+		iput(next_inode);
 	err = ext4_handle_dirty_super(handle, sb);
 	rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
 	if (!err)
@@ -2604,14 +2627,17 @@  int ext4_orphan_add(handle_t *handle, struct inode *inode)
 	 * This is safe: on error we're going to ignore the orphan list
 	 * anyway on the next recovery. */
 mem_insert:
-	if (!err)
-		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
+	if (!err) {
+		spin_lock(orphan_lock);
+		list_add(&ext4_inode->i_orphan, &ext4_sb->s_orphan);
+		spin_unlock(orphan_lock);
+	}
 
 	jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
 	jbd_debug(4, "orphan inode %lu will point to %d\n",
 			inode->i_ino, NEXT_ORPHAN(inode));
 out_unlock:
-	mutex_unlock(&EXT4_SB(sb)->s_orphan_lock);
+	mutex_unlock(inode_orphan_mutex);
 	ext4_std_error(inode->i_sb, err);
 	return err;
 }
@@ -2624,71 +2650,106 @@  int ext4_orphan_del(handle_t *handle, struct inode *inode)
 {
 	struct list_head *prev;
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	struct ext4_sb_info *sbi;
+	struct inode *i_next = NULL;
+	struct inode *i_prev = NULL;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	__u32 ino_next;
+	unsigned int ino_prev;
 	struct ext4_iloc iloc;
 	int err = 0;
+	struct mutex *inode_orphan_mutex;
+	spinlock_t *orphan_lock;
+	struct mutex *ondisk_orphan_mutex;
 
-	if ((!EXT4_SB(inode->i_sb)->s_journal) &&
-	    !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS))
+	if ((!sbi->s_journal) &&
+	    !(sbi->s_mount_state & EXT4_ORPHAN_FS))
 		return 0;
 
-	mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
-	if (list_empty(&ei->i_orphan))
-		goto out;
-
-	ino_next = NEXT_ORPHAN(inode);
+	inode_orphan_mutex = &ei->i_orphan_mutex;
+	orphan_lock = &sbi->s_orphan_lock;
+	ondisk_orphan_mutex = &sbi->s_ondisk_orphan_lock;
+	mutex_lock(inode_orphan_mutex);
 	prev = ei->i_orphan.prev;
-	sbi = EXT4_SB(inode->i_sb);
+	if (prev == &ei->i_orphan) {
+		mutex_unlock(inode_orphan_mutex);
+		return err;
+	}
 
 	jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
 
+	spin_lock(orphan_lock);
 	list_del_init(&ei->i_orphan);
+	spin_unlock(orphan_lock);
 
 	/* If we're on an error path, we may not have a valid
 	 * transaction handle with which to update the orphan list on
 	 * disk, but we still need to remove the inode from the linked
 	 * list in memory. */
-	if (!handle)
-		goto out;
+	if (!handle) {
+		mutex_unlock(inode_orphan_mutex);
+		return err;
+	}
 
 	err = ext4_reserve_inode_write(handle, inode, &iloc);
-	if (err)
+	if (err) {
+		mutex_unlock(inode_orphan_mutex);
 		goto out_err;
+	}
 
-	if (prev == &sbi->s_orphan) {
+	mutex_lock(ondisk_orphan_mutex);
+	ino_next = NEXT_ORPHAN(inode);
+	if (ino_next > 0) {
+		i_next = ilookup(inode->i_sb, ino_next);
+		assert(i_next);
+	}
+	ino_prev = ei->i_prev_orphan;
+	if (!ino_prev) {
 		jbd_debug(4, "superblock will point to %u\n", ino_next);
 		BUFFER_TRACE(sbi->s_sbh, "get_write_access");
 		err = ext4_journal_get_write_access(handle, sbi->s_sbh);
-		if (err)
-			goto out_brelse;
-		sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
-		err = ext4_handle_dirty_super(handle, inode->i_sb);
+		if (!err) {
+			sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
+			if (i_next)
+				EXT4_I(i_next)->i_prev_orphan = 0;
+		}
+		mutex_unlock(ondisk_orphan_mutex);
+		if (!err)
+			err = ext4_handle_dirty_super(handle, inode->i_sb);
 	} else {
 		struct ext4_iloc iloc2;
-		struct inode *i_prev =
-			&list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
-
-		jbd_debug(4, "orphan inode %lu will point to %u\n",
-			  i_prev->i_ino, ino_next);
-		err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
-		if (err)
-			goto out_brelse;
-		NEXT_ORPHAN(i_prev) = ino_next;
-		err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
+		i_prev = ilookup(inode->i_sb, ino_prev);
+
+		assert(i_prev);
+		if (i_prev) {
+			jbd_debug(4, "orphan inode %lu will point to %u\n",
+				i_prev->i_ino, ino_next);
+			err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
+		} else
+			err = -ENOENT;
+		if (!err) {
+			NEXT_ORPHAN(i_prev) = ino_next;
+			if (i_next)
+				EXT4_I(i_next)->i_prev_orphan = ino_prev;
+		}
+		mutex_unlock(ondisk_orphan_mutex);
+		if (i_prev && !err)
+			err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
 	}
 	if (err)
 		goto out_brelse;
 	NEXT_ORPHAN(inode) = 0;
+	mutex_unlock(inode_orphan_mutex);
 	err = ext4_mark_iloc_dirty(handle, inode, &iloc);
-
 out_err:
+	if (i_next)
+		iput(i_next);
+	if (i_prev)
+		iput(i_prev);
 	ext4_std_error(inode->i_sb, err);
-out:
-	mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
 	return err;
 
 out_brelse:
+	mutex_unlock(inode_orphan_mutex);
 	brelse(iloc.bh);
 	goto out_err;
 }