
ext4: Check io list state and avoid an unnecessary mutex_lock in ext4_end_io_work.

Message ID 20111029205740.GB16825@thunk.org
State Superseded, archived

Commit Message

Theodore Ts'o Oct. 29, 2011, 8:57 p.m. UTC
On Wed, Sep 14, 2011 at 04:33:02PM +0800, Tao Ma wrote:
> From: Tao Ma <boyu.mt@taobao.com>
> 
> When we finish the end io work in ext4_flush_completed_IO, we take
> the io work off the list, but don't free it. Then, in the workqueue,
> we can check the list state and avoid the extra work if it is already
> done. That is fine, but we check the list state in ext4_end_io_nolock with
> i_mutex held, instead of under the spin_lock as everywhere else. This is wrong.

Hi Tao,

Thanks for finding and pointing out this bug in ext4_end_io_nolock().
Unfortunately, your proposed fix doesn't take into account that there
are other callers of ext4_end_io_nolock() besides ext4_end_io_work(),
and so it's not sufficient to move the test from the former function to
the latter.

Attached please find the patch which I am planning to check into the
ext4 tree to address this bug which you have pointed out.

Regards,

						- Ted

commit c0e36d8410bfad4db4edefeb4175f85a5d216c8d
Author: Theodore Ts'o <tytso@mit.edu>
Date:   Sat Oct 29 16:57:19 2011 -0400

    ext4: use spinlock before checking io->list in ext4_end_io_nolock()

    In ext4_end_io_nolock(), io->list is checked to see if it is empty
    without taking the ei->i_completed_io_lock spinlock.  This violates
    the locking protocol and can cause very hard-to-debug failures.

    Also optimize ext4_end_io_work() so that if ext4_end_io_nolock() is
    not going to do any work, we don't try to take i_mutex and possibly
    requeue the end_io request when the trylock fails to grab the mutex.
    
    Thanks to Tao Ma <boyu.mt@taobao.com> for spotting the error and
    providing an initial fix to address the problem.
    
    Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
    Signed-off-by: Tao Ma <boyu.mt@taobao.com>
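
The rule the commit message relies on can be shown with a minimal user-space
C analogue (illustration only; this is not kernel code, is not part of the
patch, and all names in it are made up): whether an entry is still queued may
only be tested while holding the lock that protects the queue, which is what
the patch below does with ei->i_completed_io_lock around the list_empty() test.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Toy stand-in for ext4_io_end_t; on_list models !list_empty(&io->list). */
struct io_end {
	bool on_list;
};

/* Stand-in for ei->i_completed_io_lock. */
static pthread_mutex_t completed_io_lock = PTHREAD_MUTEX_INITIALIZER;
static struct io_end io = { .on_list = true };

/* Flush-path analogue: takes the entry off the "list" under the lock,
 * roughly like list_del_init(&io->list) done under the spinlock. */
static void *flush_path(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&completed_io_lock);
	io.on_list = false;
	pthread_mutex_unlock(&completed_io_lock);
	return NULL;
}

/* Worker-side check done the wrong way: reads the shared state without
 * the lock, the pattern the patch removes from ext4_end_io_nolock(). */
static bool work_pending_unlocked(void)
{
	return io.on_list;
}

/* Worker-side check done the right way: the lock is held across the test,
 * mirroring the spin_lock_irqsave()/list_empty() pairing in the patch. */
static bool work_pending_locked(void)
{
	bool pending;

	pthread_mutex_lock(&completed_io_lock);
	pending = io.on_list;
	pthread_mutex_unlock(&completed_io_lock);
	return pending;
}

int main(void)
{
	pthread_t flusher;

	printf("before flush, pending (unlocked): %d\n", work_pending_unlocked());
	pthread_create(&flusher, NULL, flush_path, NULL);
	pthread_join(flusher, NULL);
	printf("after flush, pending (locked):    %d\n", work_pending_locked());
	return 0;
}

Build with "cc -pthread"; the point is simply to keep the test under the same
lock that the list's writers take, which is the protocol the commit message
refers to.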


Comments

Tao Ma Oct. 30, 2011, 7:50 a.m. UTC | #1
Hi Ted,
On 10/30/2011 04:57 AM, Ted Ts'o wrote:
> On Wed, Sep 14, 2011 at 04:33:02PM +0800, Tao Ma wrote:
>> From: Tao Ma <boyu.mt@taobao.com>
>>
>> When we finish the end io work in ext4_flush_completed_IO, we take
>> the io work off the list, but don't free it. Then, in the workqueue,
>> we can check the list state and avoid the extra work if it is already
>> done. That is fine, but we check the list state in ext4_end_io_nolock with
>> i_mutex held, instead of under the spin_lock as everywhere else. This is wrong.
> 
> Hi Tao,
> 
> Thanks for finding and pointing out this bug in ext4_end_io_nolock().
> Unfortunately, your proposed fix doesn't take into account that there
> are other callers of ext4_end_io_nolock() besides ext4_end_io_work(),
> and so it's not sufficient to move the test from the former function to
> the latter.
Sorry, but I thought I had considered this case.
There are two callers. One is ext4_end_io_work (which has the bug I pointed
out); the other is ext4_flush_completed_IO, which has already done the
check before calling ext4_end_io_nolock. That is why I moved the check
from ext4_end_io_nolock to ext4_end_io_work. So for the
ext4_flush_completed_IO case, your new patch will take the spin_lock twice
for the check. Am I missing something here?

Thanks
Tao
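
For context, the shape of the ext4_flush_completed_IO() loop being discussed
looks roughly like the sketch below (a paraphrase for illustration, not the
verbatim fs/ext4 source of the time): the walk over ei->i_completed_io_list
happens under i_completed_io_lock, the lock is dropped around each
ext4_end_io_nolock() call, and the entry is unlinked only after a successful
conversion, which is why the work function can later see an empty io->list
and skip the work.

/*
 * Simplified sketch of ext4_flush_completed_IO() as discussed above;
 * paraphrased for illustration, not the exact kernel source.
 */
int ext4_flush_completed_IO(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_io_end_t *io;
	unsigned long flags;
	int ret = 0, ret2 = 0;

	/* Fast-path test; note that it is done without the spinlock. */
	if (list_empty(&ei->i_completed_io_list))
		return 0;

	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
	while (!list_empty(&ei->i_completed_io_list)) {
		io = list_entry(ei->i_completed_io_list.next,
				ext4_io_end_t, list);
		/*
		 * Drop the lock around the conversion work.  The io_end
		 * stays on the list while it is being processed, so a
		 * concurrent ext4_end_io_work() can tell whether anything
		 * is left to do by looking at io->list, but only under
		 * the same spinlock.
		 */
		spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
		ret = ext4_end_io_nolock(io);
		spin_lock_irqsave(&ei->i_completed_io_lock, flags);
		if (ret < 0)
			ret2 = ret;
		else
			list_del_init(&io->list);
	}
	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);

	return (ret2 < 0) ? ret2 : 0;
}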

Theodore Ts'o Oct. 31, 2011, 3:02 p.m. UTC | #2
On Sun, Oct 30, 2011 at 03:50:25PM +0800, Tao Ma wrote:
> Sorry, but I thought I had considered this case.
> There are two callers. One is ext4_end_io_work (which has the bug I pointed
> out); the other is ext4_flush_completed_IO, which has already done the
> check before calling ext4_end_io_nolock. That is why I moved the check
> from ext4_end_io_nolock to ext4_end_io_work. So for the
> ext4_flush_completed_IO case, your new patch will take the spin_lock twice
> for the check. Am I missing something here?

Ah, you're right; my mistake.  When I looked closely, though, I found
that ext4_flush_completed_IO() had a call to list_empty() without
taking the spinlock, which would also be problematic.  When I looked
more closely, I found more ways to optimize things, which also close
up a few potential (I think theoretical) race conditions.

Let me know what you think....

					- Ted

Patch

diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 92f38ee..0af5607 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -90,21 +90,27 @@  void ext4_free_io_end(ext4_io_end_t *io)
  */
 int ext4_end_io_nolock(ext4_io_end_t *io)
 {
-	struct inode *inode = io->inode;
-	loff_t offset = io->offset;
-	ssize_t size = io->size;
-	wait_queue_head_t *wq;
-	int ret = 0;
+	unsigned long		flags;
+	struct inode		*inode = io->inode;
+	loff_t			offset = io->offset;
+	ssize_t			size = io->size;
+	struct ext4_inode_info	*ei = EXT4_I(inode);
+	wait_queue_head_t	*wq;
+	int			ret;
 
 	ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
 		   "list->prev 0x%p\n",
 		   io, inode->i_ino, io->list.next, io->list.prev);
 
-	if (list_empty(&io->list))
-		return ret;
+	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+	if (list_empty(&io->list)) {
+		spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+		return 0;
+	}
+	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
 
 	if (!(io->flag & EXT4_IO_END_UNWRITTEN))
-		return ret;
+		return 0;
 
 	ret = ext4_convert_unwritten_extents(inode, offset, size);
 	if (ret < 0) {
@@ -142,6 +148,16 @@  static void ext4_end_io_work(struct work_struct *work)
 	unsigned long		flags;
 	int			ret;
 
+	if (!(io->flag & EXT4_IO_END_UNWRITTEN))
+		goto free_io_end;
+
+	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+	if (list_empty(&io->list)) {
+		spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+		goto free_io_end;
+	}
+	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+
 	if (!mutex_trylock(&inode->i_mutex)) {
 		/*
 		 * Requeue the work instead of waiting so that the work
@@ -170,6 +186,7 @@  static void ext4_end_io_work(struct work_struct *work)
 		list_del_init(&io->list);
 	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
 	mutex_unlock(&inode->i_mutex);
+free_io_end:
 	ext4_free_io_end(io);
 }