Patchwork EXT4 regression caused 4eec7

login
register
mail settings
Submitter Theodore Ts'o
Date May 11, 2013, 11:05 p.m.
Message ID <20130511230559.GD26298@thunk.org>
Download mbox | patch
Permalink /patch/243154/
State Accepted
Headers show

Comments

Theodore Ts'o - May 11, 2013, 11:05 p.m.
On Sat, May 11, 2013 at 03:00:53PM +0400, Dmitry Monakhov wrote:
> I've bisected ext4 related issue. It is appeared that it is pure ext4
> specific. Regression caused by  following commit
> commit 4eec708d263f0ee10861d69251708a225b64cac7
> Author: Jan Kara <jack@suse.cz>
> Date:   Thu Apr 11 23:56:53 2013 -0400
>     ext4: use io_end for multiple bios

Hmm... the next question is why did I see this in my testing.  Did you
find this on ia64, or x86?  Also what about the slab corruption which
you saw when running XFS; was that unrelated?

Since at this point it's safer to rollback the change and we can
investigate more deeply how to fix it correctly for the next
development cycle, this is the patch which I'm testing.

	    	   	       	     - Ted

From dfd5057908a66b414d5d5fa2fc7ff7945e5a277c Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 11 May 2013 18:59:39 -0400
Subject: [PATCH] Revert "ext4: use io_end for multiple bios"

This reverts commit 4eec708d263f0ee10861d69251708a225b64cac7.

Multiple users have reported crashes which is apparently caused by
this commit.  Thanks to Dmitry Monakhov for bisecting it.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Dmitry Monakhov <dmonakhov@openvz.org>
Cc: Jan Kara <jack@suse.cz>
---
 fs/ext4/ext4.h    |   8 +---
 fs/ext4/inode.c   |  85 +++++++++++++++++---------------------
 fs/ext4/page-io.c | 121 ++++++++++++++++++++----------------------------------
 3 files changed, 85 insertions(+), 129 deletions(-)
Dmitri Monakho - May 12, 2013, 9:01 a.m.
On Sat, 11 May 2013 19:05:59 -0400, Theodore Ts'o <tytso@mit.edu> wrote:
> On Sat, May 11, 2013 at 03:00:53PM +0400, Dmitry Monakhov wrote:
> > I've bisected ext4 related issue. It is appeared that it is pure ext4
> > specific. Regression caused by  following commit
> > commit 4eec708d263f0ee10861d69251708a225b64cac7
> > Author: Jan Kara <jack@suse.cz>
> > Date:   Thu Apr 11 23:56:53 2013 -0400
> >     ext4: use io_end for multiple bios
> 
> Hmm... the next question is why did I see this in my testing.  Did you
> find this on ia64, or x86?
No simple x86_64. 
> Also what about the slab corruption which
> you saw when running XFS; was that unrelated?
My theory about mysterious corruption in mm layer which broke everything
was wrong. We have to absolutely independent regressions in  different
filesystems:
* Slub corruption on XFS
  - testcase: xfstests/generic/007
  - bad commit: 666d64 
  - LINK: https://lkml.org/lkml/2013/5/11/154

* Slub corruption on EXT4
  - testcase: xfstests/generic/299
  - bad commit: 4eec70
  - link: https://lkml.org/lkml/2013/5/11/37
  - fix: https://lkml.org/lkml/2013/5/11/142

In fact '4eec70' are vexing because I have reviewed and tested this patch before
it was marked as Review-by, but missed the bug. This is because xfstests
was executed manually logs was full of warnings but tainted flag was not
checked at the end. To prevent this thing in future I'll always use my
local autotest(http://autotest.github.io/) farm next time.

BTW here is my config with various debug options enabled.
> 
> Since at this point it's safer to rollback the change and we can
> investigate more deeply how to fix it correctly for the next
> development cycle, this is the patch which I'm testing.
> 
> 	    	   	       	     - Ted
> 
> From dfd5057908a66b414d5d5fa2fc7ff7945e5a277c Mon Sep 17 00:00:00 2001
> From: Theodore Ts'o <tytso@mit.edu>
> Date: Sat, 11 May 2013 18:59:39 -0400
> Subject: [PATCH] Revert "ext4: use io_end for multiple bios"
> 
> This reverts commit 4eec708d263f0ee10861d69251708a225b64cac7.
> 
> Multiple users have reported crashes which is apparently caused by
> this commit.  Thanks to Dmitry Monakhov for bisecting it.
> 
> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
> Cc: Dmitry Monakhov <dmonakhov@openvz.org>
> Cc: Jan Kara <jack@suse.cz>
> ---
>  fs/ext4/ext4.h    |   8 +---
>  fs/ext4/inode.c   |  85 +++++++++++++++++---------------------
>  fs/ext4/page-io.c | 121 ++++++++++++++++++++----------------------------------
>  3 files changed, 85 insertions(+), 129 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 0aabb34..5aae3d1 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -209,7 +209,6 @@ typedef struct ext4_io_end {
>  	ssize_t			size;		/* size of the extent */
>  	struct kiocb		*iocb;		/* iocb struct for AIO */
>  	int			result;		/* error value for AIO */
> -	atomic_t		count;		/* reference counter */
>  } ext4_io_end_t;
>  
>  struct ext4_io_submit {
> @@ -2651,14 +2650,11 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
>  
>  /* page-io.c */
>  extern int __init ext4_init_pageio(void);
> +extern void ext4_add_complete_io(ext4_io_end_t *io_end);
>  extern void ext4_exit_pageio(void);
>  extern void ext4_ioend_shutdown(struct inode *);
> +extern void ext4_free_io_end(ext4_io_end_t *io);
>  extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
> -extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
> -extern int ext4_put_io_end(ext4_io_end_t *io_end);
> -extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
> -extern void ext4_io_submit_init(struct ext4_io_submit *io,
> -				struct writeback_control *wbc);
>  extern void ext4_end_io_work(struct work_struct *work);
>  extern void ext4_io_submit(struct ext4_io_submit *io);
>  extern int ext4_bio_write_page(struct ext4_io_submit *io,
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 793d44b..d666569 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1487,10 +1487,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
>  	struct ext4_io_submit io_submit;
>  
>  	BUG_ON(mpd->next_page <= mpd->first_page);
> -	ext4_io_submit_init(&io_submit, mpd->wbc);
> -	io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
> -	if (!io_submit.io_end)
> -		return -ENOMEM;
> +	memset(&io_submit, 0, sizeof(io_submit));
>  	/*
>  	 * We need to start from the first_page to the next_page - 1
>  	 * to make sure we also write the mapped dirty buffer_heads.
> @@ -1578,8 +1575,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
>  		pagevec_release(&pvec);
>  	}
>  	ext4_io_submit(&io_submit);
> -	/* Drop io_end reference we got from init */
> -	ext4_put_io_end_defer(io_submit.io_end);
>  	return ret;
>  }
>  
> @@ -2238,16 +2233,9 @@ static int ext4_writepage(struct page *page,
>  		 */
>  		return __ext4_journalled_writepage(page, len);
>  
> -	ext4_io_submit_init(&io_submit, wbc);
> -	io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
> -	if (!io_submit.io_end) {
> -		redirty_page_for_writepage(wbc, page);
> -		return -ENOMEM;
> -	}
> +	memset(&io_submit, 0, sizeof(io_submit));
>  	ret = ext4_bio_write_page(&io_submit, page, len, wbc);
>  	ext4_io_submit(&io_submit);
> -	/* Drop io_end reference we got from init */
> -	ext4_put_io_end_defer(io_submit.io_end);
>  	return ret;
>  }
>  
> @@ -3078,13 +3066,9 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
>  	struct inode *inode = file_inode(iocb->ki_filp);
>          ext4_io_end_t *io_end = iocb->private;
>  
> -	/* if not async direct IO just return */
> -	if (!io_end) {
> -		inode_dio_done(inode);
> -		if (is_async)
> -			aio_complete(iocb, ret, 0);
> -		return;
> -	}
> +	/* if not async direct IO or dio with 0 bytes write, just return */
> +	if (!io_end || !size)
> +		goto out;
>  
>  	ext_debug("ext4_end_io_dio(): io_end 0x%p "
>  		  "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
> @@ -3092,13 +3076,25 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
>  		  size);
>  
>  	iocb->private = NULL;
> +
> +	/* if not aio dio with unwritten extents, just free io and return */
> +	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
> +		ext4_free_io_end(io_end);
> +out:
> +		inode_dio_done(inode);
> +		if (is_async)
> +			aio_complete(iocb, ret, 0);
> +		return;
> +	}
> +
>  	io_end->offset = offset;
>  	io_end->size = size;
>  	if (is_async) {
>  		io_end->iocb = iocb;
>  		io_end->result = ret;
>  	}
> -	ext4_put_io_end_defer(io_end);
> +
> +	ext4_add_complete_io(io_end);
>  }
>  
>  /*
> @@ -3132,7 +3128,6 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
>  	get_block_t *get_block_func = NULL;
>  	int dio_flags = 0;
>  	loff_t final_size = offset + count;
> -	ext4_io_end_t *io_end = NULL;
>  
>  	/* Use the old path for reads and writes beyond i_size. */
>  	if (rw != WRITE || final_size > inode->i_size)
> @@ -3171,16 +3166,13 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
>  	iocb->private = NULL;
>  	ext4_inode_aio_set(inode, NULL);
>  	if (!is_sync_kiocb(iocb)) {
> -		io_end = ext4_init_io_end(inode, GFP_NOFS);
> +		ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS);
>  		if (!io_end) {
>  			ret = -ENOMEM;
>  			goto retake_lock;
>  		}
>  		io_end->flag |= EXT4_IO_END_DIRECT;
> -		/*
> -		 * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
> -		 */
> -		iocb->private = ext4_get_io_end(io_end);
> +		iocb->private = io_end;
>  		/*
>  		 * we save the io structure for current async direct
>  		 * IO, so that later ext4_map_blocks() could flag the
> @@ -3204,27 +3196,26 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
>  				   NULL,
>  				   dio_flags);
>  
> +	if (iocb->private)
> +		ext4_inode_aio_set(inode, NULL);
>  	/*
> -	 * Put our reference to io_end. This can free the io_end structure e.g.
> -	 * in sync IO case or in case of error. It can even perform extent
> -	 * conversion if all bios we submitted finished before we got here.
> -	 * Note that in that case iocb->private can be already set to NULL
> -	 * here.
> +	 * The io_end structure takes a reference to the inode, that
> +	 * structure needs to be destroyed and the reference to the
> +	 * inode need to be dropped, when IO is complete, even with 0
> +	 * byte write, or failed.
> +	 *
> +	 * In the successful AIO DIO case, the io_end structure will
> +	 * be destroyed and the reference to the inode will be dropped
> +	 * after the end_io call back function is called.
> +	 *
> +	 * In the case there is 0 byte write, or error case, since VFS
> +	 * direct IO won't invoke the end_io call back function, we
> +	 * need to free the end_io structure here.
>  	 */
> -	if (io_end) {
> -		ext4_inode_aio_set(inode, NULL);
> -		ext4_put_io_end(io_end);
> -		/*
> -		 * In case of error or no write ext4_end_io_dio() was not
> -		 * called so we have to put iocb's reference.
> -		 */
> -		if (ret <= 0 && ret != -EIOCBQUEUED) {
> -			WARN_ON(iocb->private != io_end);
> -			ext4_put_io_end(io_end);
> -			iocb->private = NULL;
> -		}
> -	}
> -	if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
> +	if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
> +		ext4_free_io_end(iocb->private);
> +		iocb->private = NULL;
> +	} else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
>  						EXT4_STATE_DIO_UNWRITTEN)) {
>  		int err;
>  		/*
> diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
> index 5929cd0..6626aba 100644
> --- a/fs/ext4/page-io.c
> +++ b/fs/ext4/page-io.c
> @@ -61,28 +61,15 @@ void ext4_ioend_shutdown(struct inode *inode)
>  		cancel_work_sync(&EXT4_I(inode)->i_unwritten_work);
>  }
>  
> -static void ext4_release_io_end(ext4_io_end_t *io_end)
> +void ext4_free_io_end(ext4_io_end_t *io)
>  {
> -	BUG_ON(!list_empty(&io_end->list));
> -	BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
> -
> -	if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
> -		wake_up_all(ext4_ioend_wq(io_end->inode));
> -	if (io_end->flag & EXT4_IO_END_DIRECT)
> -		inode_dio_done(io_end->inode);
> -	if (io_end->iocb)
> -		aio_complete(io_end->iocb, io_end->result, 0);
> -	kmem_cache_free(io_end_cachep, io_end);
> -}
> -
> -static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
> -{
> -	struct inode *inode = io_end->inode;
> +	BUG_ON(!io);
> +	BUG_ON(!list_empty(&io->list));
> +	BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
>  
> -	io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
> -	/* Wake up anyone waiting on unwritten extent conversion */
> -	if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
> -		wake_up_all(ext4_ioend_wq(inode));
> +	if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count))
> +		wake_up_all(ext4_ioend_wq(io->inode));
> +	kmem_cache_free(io_end_cachep, io);
>  }
>  
>  /* check a range of space and convert unwritten extents to written. */
> @@ -105,8 +92,13 @@ static int ext4_end_io(ext4_io_end_t *io)
>  			 "(inode %lu, offset %llu, size %zd, error %d)",
>  			 inode->i_ino, offset, size, ret);
>  	}
> -	ext4_clear_io_unwritten_flag(io);
> -	ext4_release_io_end(io);
> +	/* Wake up anyone waiting on unwritten extent conversion */
> +	if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
> +		wake_up_all(ext4_ioend_wq(inode));
> +	if (io->flag & EXT4_IO_END_DIRECT)
> +		inode_dio_done(inode);
> +	if (io->iocb)
> +		aio_complete(io->iocb, io->result, 0);
>  	return ret;
>  }
>  
> @@ -137,7 +129,7 @@ static void dump_completed_IO(struct inode *inode)
>  }
>  
>  /* Add the io_end to per-inode completed end_io list. */
> -static void ext4_add_complete_io(ext4_io_end_t *io_end)
> +void ext4_add_complete_io(ext4_io_end_t *io_end)
>  {
>  	struct ext4_inode_info *ei = EXT4_I(io_end->inode);
>  	struct workqueue_struct *wq;
> @@ -174,6 +166,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode)
>  		err = ext4_end_io(io);
>  		if (unlikely(!ret && err))
>  			ret = err;
> +		io->flag &= ~EXT4_IO_END_UNWRITTEN;
> +		ext4_free_io_end(io);
>  	}
>  	return ret;
>  }
> @@ -205,43 +199,10 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
>  		atomic_inc(&EXT4_I(inode)->i_ioend_count);
>  		io->inode = inode;
>  		INIT_LIST_HEAD(&io->list);
> -		atomic_set(&io->count, 1);
>  	}
>  	return io;
>  }
>  
> -void ext4_put_io_end_defer(ext4_io_end_t *io_end)
> -{
> -	if (atomic_dec_and_test(&io_end->count)) {
> -		if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
> -			ext4_release_io_end(io_end);
> -			return;
> -		}
> -		ext4_add_complete_io(io_end);
> -	}
> -}
> -
> -int ext4_put_io_end(ext4_io_end_t *io_end)
> -{
> -	int err = 0;
> -
> -	if (atomic_dec_and_test(&io_end->count)) {
> -		if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
> -			err = ext4_convert_unwritten_extents(io_end->inode,
> -						io_end->offset, io_end->size);
> -			ext4_clear_io_unwritten_flag(io_end);
> -		}
> -		ext4_release_io_end(io_end);
> -	}
> -	return err;
> -}
> -
> -ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
> -{
> -	atomic_inc(&io_end->count);
> -	return io_end;
> -}
> -
>  /*
>   * Print an buffer I/O error compatible with the fs/buffer.c.  This
>   * provides compatibility with dmesg scrapers that look for a specific
> @@ -324,7 +285,12 @@ static void ext4_end_bio(struct bio *bio, int error)
>  			     bi_sector >> (inode->i_blkbits - 9));
>  	}
>  
> -	ext4_put_io_end_defer(io_end);
> +	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
> +		ext4_free_io_end(io_end);
> +		return;
> +	}
> +
> +	ext4_add_complete_io(io_end);
>  }
>  
>  void ext4_io_submit(struct ext4_io_submit *io)
> @@ -338,37 +304,40 @@ void ext4_io_submit(struct ext4_io_submit *io)
>  		bio_put(io->io_bio);
>  	}
>  	io->io_bio = NULL;
> -}
> -
> -void ext4_io_submit_init(struct ext4_io_submit *io,
> -			 struct writeback_control *wbc)
> -{
> -	io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?  WRITE_SYNC : WRITE);
> -	io->io_bio = NULL;
> +	io->io_op = 0;
>  	io->io_end = NULL;
>  }
>  
> -static int io_submit_init_bio(struct ext4_io_submit *io,
> -			      struct buffer_head *bh)
> +static int io_submit_init(struct ext4_io_submit *io,
> +			  struct inode *inode,
> +			  struct writeback_control *wbc,
> +			  struct buffer_head *bh)
>  {
> +	ext4_io_end_t *io_end;
> +	struct page *page = bh->b_page;
>  	int nvecs = bio_get_nr_vecs(bh->b_bdev);
>  	struct bio *bio;
>  
> +	io_end = ext4_init_io_end(inode, GFP_NOFS);
> +	if (!io_end)
> +		return -ENOMEM;
>  	bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
>  	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
>  	bio->bi_bdev = bh->b_bdev;
> +	bio->bi_private = io->io_end = io_end;
>  	bio->bi_end_io = ext4_end_bio;
> -	bio->bi_private = ext4_get_io_end(io->io_end);
> -	if (!io->io_end->size)
> -		io->io_end->offset = (bh->b_page->index << PAGE_CACHE_SHIFT)
> -				     + bh_offset(bh);
> +
> +	io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
> +
>  	io->io_bio = bio;
> +	io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?  WRITE_SYNC : WRITE);
>  	io->io_next_block = bh->b_blocknr;
>  	return 0;
>  }
>  
>  static int io_submit_add_bh(struct ext4_io_submit *io,
>  			    struct inode *inode,
> +			    struct writeback_control *wbc,
>  			    struct buffer_head *bh)
>  {
>  	ext4_io_end_t *io_end;
> @@ -379,18 +348,18 @@ submit_and_retry:
>  		ext4_io_submit(io);
>  	}
>  	if (io->io_bio == NULL) {
> -		ret = io_submit_init_bio(io, bh);
> +		ret = io_submit_init(io, inode, wbc, bh);
>  		if (ret)
>  			return ret;
>  	}
> -	ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
> -	if (ret != bh->b_size)
> -		goto submit_and_retry;
>  	io_end = io->io_end;
>  	if (test_clear_buffer_uninit(bh))
>  		ext4_set_io_unwritten_flag(inode, io_end);
> -	io_end->size += bh->b_size;
> +	io->io_end->size += bh->b_size;
>  	io->io_next_block++;
> +	ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
> +	if (ret != bh->b_size)
> +		goto submit_and_retry;
>  	return 0;
>  }
>  
> @@ -462,7 +431,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
>  	do {
>  		if (!buffer_async_write(bh))
>  			continue;
> -		ret = io_submit_add_bh(io, inode, bh);
> +		ret = io_submit_add_bh(io, inode, wbc, bh);
>  		if (ret) {
>  			/*
>  			 * We only get here on ENOMEM.  Not much else
> -- 
> 1.7.12.rc0.22.gcdd159b
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Sandeen - May 13, 2013, 4:34 p.m.
On 5/12/13 4:01 AM, Dmitry Monakhov wrote:
> In fact '4eec70' are vexing because I have reviewed and tested this patch before
> it was marked as Review-by, but missed the bug. This is because xfstests
> was executed manually logs was full of warnings but tainted flag was not
> checked at the end. 

Can you elaborate on this?  What was logged, and is it something we could
try to pick up post-test in xfstests?

Thanks,
-Eric
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara - May 13, 2013, 5:01 p.m.
On Mon 13-05-13 11:34:12, Eric Sandeen wrote:
> On 5/12/13 4:01 AM, Dmitry Monakhov wrote:
> > In fact '4eec70' are vexing because I have reviewed and tested this patch before
> > it was marked as Review-by, but missed the bug. This is because xfstests
> > was executed manually logs was full of warnings but tainted flag was not
> > checked at the end. 
> 
> Can you elaborate on this?  What was logged, and is it something we could
> try to pick up post-test in xfstests?
  Generally I think it might be useful if xfstests would fail / warn if
kernel became tainted during the test (e.g. due to WARN_ON or oops, or
something like that). It should be even relatively easy to implement
(just compare /proc/sys/kernel/tainted before and after each test).

								Honza
Eric Sandeen - May 13, 2013, 5:09 p.m.
On 5/13/13 12:01 PM, Jan Kara wrote:
> On Mon 13-05-13 11:34:12, Eric Sandeen wrote:
>> On 5/12/13 4:01 AM, Dmitry Monakhov wrote:
>>> In fact '4eec70' are vexing because I have reviewed and tested this patch before
>>> it was marked as Review-by, but missed the bug. This is because xfstests
>>> was executed manually logs was full of warnings but tainted flag was not
>>> checked at the end. 
>>
>> Can you elaborate on this?  What was logged, and is it something we could
>> try to pick up post-test in xfstests?
>   Generally I think it might be useful if xfstests would fail / warn if
> kernel became tainted during the test (e.g. due to WARN_ON or oops, or
> something like that). It should be even relatively easy to implement
> (just compare /proc/sys/kernel/tainted before and after each test).
> 
> 								Honza
> 

Ah, right.  That should be easy, I'll see if I can cook that up.

Thanks,
-Eric
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dmitri Monakho - May 14, 2013, 7:11 a.m.
On Mon, 13 May 2013 12:09:22 -0500, Eric Sandeen <sandeen@redhat.com> wrote:
> On 5/13/13 12:01 PM, Jan Kara wrote:
> > On Mon 13-05-13 11:34:12, Eric Sandeen wrote:
> >> On 5/12/13 4:01 AM, Dmitry Monakhov wrote:
> >>> In fact '4eec70' are vexing because I have reviewed and tested this patch before
> >>> it was marked as Review-by, but missed the bug. This is because xfstests
> >>> was executed manually logs was full of warnings but tainted flag was not
> >>> checked at the end. 
> >>
> >> Can you elaborate on this?  What was logged, and is it something we could
> >> try to pick up post-test in xfstests?
> >   Generally I think it might be useful if xfstests would fail / warn if
> > kernel became tainted during the test (e.g. due to WARN_ON or oops, or
> > something like that). It should be even relatively easy to implement
> > (just compare /proc/sys/kernel/tainted before and after each test).
> > 
> > 								Honza
> > 
> 
> Ah, right.  That should be easy, I'll see if I can cook that up.
Also we can use abrt's kernel-oops handler to collect messages.
> 
> Thanks,
> -Eric
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
 
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Sandeen - May 14, 2013, 2:08 p.m.
On 5/14/13 2:11 AM, Dmitry Monakhov wrote:
> On Mon, 13 May 2013 12:09:22 -0500, Eric Sandeen <sandeen@redhat.com> wrote:
>> On 5/13/13 12:01 PM, Jan Kara wrote:
>>> On Mon 13-05-13 11:34:12, Eric Sandeen wrote:
>>>> On 5/12/13 4:01 AM, Dmitry Monakhov wrote:
>>>>> In fact '4eec70' are vexing because I have reviewed and tested this patch before
>>>>> it was marked as Review-by, but missed the bug. This is because xfstests
>>>>> was executed manually logs was full of warnings but tainted flag was not
>>>>> checked at the end. 
>>>>
>>>> Can you elaborate on this?  What was logged, and is it something we could
>>>> try to pick up post-test in xfstests?
>>>   Generally I think it might be useful if xfstests would fail / warn if
>>> kernel became tainted during the test (e.g. due to WARN_ON or oops, or
>>> something like that). It should be even relatively easy to implement
>>> (just compare /proc/sys/kernel/tainted before and after each test).
>>>
>>> 								Honza
>>>
>>
>> Ah, right.  That should be easy, I'll see if I can cook that up.
> Also we can use abrt's kernel-oops handler to collect messages.

I sent a pretty simple patch to just check the sysctl to the xfs list
yesterday.

-Eric

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara - May 14, 2013, 10:04 p.m.
On Sun 12-05-13 13:01:11, Dmitry Monakhov wrote:
> On Sat, 11 May 2013 19:05:59 -0400, Theodore Ts'o <tytso@mit.edu> wrote:
> > On Sat, May 11, 2013 at 03:00:53PM +0400, Dmitry Monakhov wrote:
> > > I've bisected ext4 related issue. It is appeared that it is pure ext4
> > > specific. Regression caused by  following commit
> > > commit 4eec708d263f0ee10861d69251708a225b64cac7
> > > Author: Jan Kara <jack@suse.cz>
> > > Date:   Thu Apr 11 23:56:53 2013 -0400
> > >     ext4: use io_end for multiple bios
> > 
> > Hmm... the next question is why did I see this in my testing.  Did you
> > find this on ia64, or x86?
> No simple x86_64. 
> > Also what about the slab corruption which
> > you saw when running XFS; was that unrelated?
> My theory about mysterious corruption in mm layer which broke everything
> was wrong. We have to absolutely independent regressions in  different
> filesystems:
> * Slub corruption on XFS
>   - testcase: xfstests/generic/007
>   - bad commit: 666d64 
>   - LINK: https://lkml.org/lkml/2013/5/11/154
> 
> * Slub corruption on EXT4
>   - testcase: xfstests/generic/299
>   - bad commit: 4eec70
>   - link: https://lkml.org/lkml/2013/5/11/37
>   - fix: https://lkml.org/lkml/2013/5/11/142
> 
> In fact '4eec70' are vexing because I have reviewed and tested this patch
> before it was marked as Review-by, but missed the bug. This is because
> xfstests was executed manually logs was full of warnings but tainted flag
> was not checked at the end. To prevent this thing in future I'll always
> use my local autotest(http://autotest.github.io/) farm next time.
  OK, so I finally nailed this down. DIO code has that peculiarity that it
doesn't call ->end_io callback when no IO was actually submitted. As a
bonus the generic code does a cleanup of generic stuff that is otherwise left
to ->end_io callback. Since I need to do io_end cleanup in ->end_io callback
I have to compensate for that in ext4_ext_direct_IO(). Sadly I've got the
condition wrong and also forgot that generic code already decremented
i_dio_count in the error failure case so cleanup sometimes happened twice.
I'll add fixed version of the patch back in my series (since the patch got
reverted upstream).

								Honza

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0aabb34..5aae3d1 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -209,7 +209,6 @@  typedef struct ext4_io_end {
 	ssize_t			size;		/* size of the extent */
 	struct kiocb		*iocb;		/* iocb struct for AIO */
 	int			result;		/* error value for AIO */
-	atomic_t		count;		/* reference counter */
 } ext4_io_end_t;
 
 struct ext4_io_submit {
@@ -2651,14 +2650,11 @@  extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 
 /* page-io.c */
 extern int __init ext4_init_pageio(void);
+extern void ext4_add_complete_io(ext4_io_end_t *io_end);
 extern void ext4_exit_pageio(void);
 extern void ext4_ioend_shutdown(struct inode *);
+extern void ext4_free_io_end(ext4_io_end_t *io);
 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
-extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
-extern int ext4_put_io_end(ext4_io_end_t *io_end);
-extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
-extern void ext4_io_submit_init(struct ext4_io_submit *io,
-				struct writeback_control *wbc);
 extern void ext4_end_io_work(struct work_struct *work);
 extern void ext4_io_submit(struct ext4_io_submit *io);
 extern int ext4_bio_write_page(struct ext4_io_submit *io,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 793d44b..d666569 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1487,10 +1487,7 @@  static int mpage_da_submit_io(struct mpage_da_data *mpd,
 	struct ext4_io_submit io_submit;
 
 	BUG_ON(mpd->next_page <= mpd->first_page);
-	ext4_io_submit_init(&io_submit, mpd->wbc);
-	io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
-	if (!io_submit.io_end)
-		return -ENOMEM;
+	memset(&io_submit, 0, sizeof(io_submit));
 	/*
 	 * We need to start from the first_page to the next_page - 1
 	 * to make sure we also write the mapped dirty buffer_heads.
@@ -1578,8 +1575,6 @@  static int mpage_da_submit_io(struct mpage_da_data *mpd,
 		pagevec_release(&pvec);
 	}
 	ext4_io_submit(&io_submit);
-	/* Drop io_end reference we got from init */
-	ext4_put_io_end_defer(io_submit.io_end);
 	return ret;
 }
 
@@ -2238,16 +2233,9 @@  static int ext4_writepage(struct page *page,
 		 */
 		return __ext4_journalled_writepage(page, len);
 
-	ext4_io_submit_init(&io_submit, wbc);
-	io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
-	if (!io_submit.io_end) {
-		redirty_page_for_writepage(wbc, page);
-		return -ENOMEM;
-	}
+	memset(&io_submit, 0, sizeof(io_submit));
 	ret = ext4_bio_write_page(&io_submit, page, len, wbc);
 	ext4_io_submit(&io_submit);
-	/* Drop io_end reference we got from init */
-	ext4_put_io_end_defer(io_submit.io_end);
 	return ret;
 }
 
@@ -3078,13 +3066,9 @@  static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 	struct inode *inode = file_inode(iocb->ki_filp);
         ext4_io_end_t *io_end = iocb->private;
 
-	/* if not async direct IO just return */
-	if (!io_end) {
-		inode_dio_done(inode);
-		if (is_async)
-			aio_complete(iocb, ret, 0);
-		return;
-	}
+	/* if not async direct IO or dio with 0 bytes write, just return */
+	if (!io_end || !size)
+		goto out;
 
 	ext_debug("ext4_end_io_dio(): io_end 0x%p "
 		  "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
@@ -3092,13 +3076,25 @@  static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 		  size);
 
 	iocb->private = NULL;
+
+	/* if not aio dio with unwritten extents, just free io and return */
+	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+		ext4_free_io_end(io_end);
+out:
+		inode_dio_done(inode);
+		if (is_async)
+			aio_complete(iocb, ret, 0);
+		return;
+	}
+
 	io_end->offset = offset;
 	io_end->size = size;
 	if (is_async) {
 		io_end->iocb = iocb;
 		io_end->result = ret;
 	}
-	ext4_put_io_end_defer(io_end);
+
+	ext4_add_complete_io(io_end);
 }
 
 /*
@@ -3132,7 +3128,6 @@  static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 	get_block_t *get_block_func = NULL;
 	int dio_flags = 0;
 	loff_t final_size = offset + count;
-	ext4_io_end_t *io_end = NULL;
 
 	/* Use the old path for reads and writes beyond i_size. */
 	if (rw != WRITE || final_size > inode->i_size)
@@ -3171,16 +3166,13 @@  static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 	iocb->private = NULL;
 	ext4_inode_aio_set(inode, NULL);
 	if (!is_sync_kiocb(iocb)) {
-		io_end = ext4_init_io_end(inode, GFP_NOFS);
+		ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS);
 		if (!io_end) {
 			ret = -ENOMEM;
 			goto retake_lock;
 		}
 		io_end->flag |= EXT4_IO_END_DIRECT;
-		/*
-		 * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
-		 */
-		iocb->private = ext4_get_io_end(io_end);
+		iocb->private = io_end;
 		/*
 		 * we save the io structure for current async direct
 		 * IO, so that later ext4_map_blocks() could flag the
@@ -3204,27 +3196,26 @@  static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 				   NULL,
 				   dio_flags);
 
+	if (iocb->private)
+		ext4_inode_aio_set(inode, NULL);
 	/*
-	 * Put our reference to io_end. This can free the io_end structure e.g.
-	 * in sync IO case or in case of error. It can even perform extent
-	 * conversion if all bios we submitted finished before we got here.
-	 * Note that in that case iocb->private can be already set to NULL
-	 * here.
+	 * The io_end structure takes a reference to the inode, that
+	 * structure needs to be destroyed and the reference to the
+	 * inode need to be dropped, when IO is complete, even with 0
+	 * byte write, or failed.
+	 *
+	 * In the successful AIO DIO case, the io_end structure will
+	 * be destroyed and the reference to the inode will be dropped
+	 * after the end_io call back function is called.
+	 *
+	 * In the case there is 0 byte write, or error case, since VFS
+	 * direct IO won't invoke the end_io call back function, we
+	 * need to free the end_io structure here.
 	 */
-	if (io_end) {
-		ext4_inode_aio_set(inode, NULL);
-		ext4_put_io_end(io_end);
-		/*
-		 * In case of error or no write ext4_end_io_dio() was not
-		 * called so we have to put iocb's reference.
-		 */
-		if (ret <= 0 && ret != -EIOCBQUEUED) {
-			WARN_ON(iocb->private != io_end);
-			ext4_put_io_end(io_end);
-			iocb->private = NULL;
-		}
-	}
-	if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
+	if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
+		ext4_free_io_end(iocb->private);
+		iocb->private = NULL;
+	} else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
 						EXT4_STATE_DIO_UNWRITTEN)) {
 		int err;
 		/*
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 5929cd0..6626aba 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -61,28 +61,15 @@  void ext4_ioend_shutdown(struct inode *inode)
 		cancel_work_sync(&EXT4_I(inode)->i_unwritten_work);
 }
 
-static void ext4_release_io_end(ext4_io_end_t *io_end)
+void ext4_free_io_end(ext4_io_end_t *io)
 {
-	BUG_ON(!list_empty(&io_end->list));
-	BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
-
-	if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
-		wake_up_all(ext4_ioend_wq(io_end->inode));
-	if (io_end->flag & EXT4_IO_END_DIRECT)
-		inode_dio_done(io_end->inode);
-	if (io_end->iocb)
-		aio_complete(io_end->iocb, io_end->result, 0);
-	kmem_cache_free(io_end_cachep, io_end);
-}
-
-static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
-{
-	struct inode *inode = io_end->inode;
+	BUG_ON(!io);
+	BUG_ON(!list_empty(&io->list));
+	BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
 
-	io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
-	/* Wake up anyone waiting on unwritten extent conversion */
-	if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
-		wake_up_all(ext4_ioend_wq(inode));
+	if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count))
+		wake_up_all(ext4_ioend_wq(io->inode));
+	kmem_cache_free(io_end_cachep, io);
 }
 
 /* check a range of space and convert unwritten extents to written. */
@@ -105,8 +92,13 @@  static int ext4_end_io(ext4_io_end_t *io)
 			 "(inode %lu, offset %llu, size %zd, error %d)",
 			 inode->i_ino, offset, size, ret);
 	}
-	ext4_clear_io_unwritten_flag(io);
-	ext4_release_io_end(io);
+	/* Wake up anyone waiting on unwritten extent conversion */
+	if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
+		wake_up_all(ext4_ioend_wq(inode));
+	if (io->flag & EXT4_IO_END_DIRECT)
+		inode_dio_done(inode);
+	if (io->iocb)
+		aio_complete(io->iocb, io->result, 0);
 	return ret;
 }
 
@@ -137,7 +129,7 @@  static void dump_completed_IO(struct inode *inode)
 }
 
 /* Add the io_end to per-inode completed end_io list. */
-static void ext4_add_complete_io(ext4_io_end_t *io_end)
+void ext4_add_complete_io(ext4_io_end_t *io_end)
 {
 	struct ext4_inode_info *ei = EXT4_I(io_end->inode);
 	struct workqueue_struct *wq;
@@ -174,6 +166,8 @@  static int ext4_do_flush_completed_IO(struct inode *inode)
 		err = ext4_end_io(io);
 		if (unlikely(!ret && err))
 			ret = err;
+		io->flag &= ~EXT4_IO_END_UNWRITTEN;
+		ext4_free_io_end(io);
 	}
 	return ret;
 }
@@ -205,43 +199,10 @@  ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
 		atomic_inc(&EXT4_I(inode)->i_ioend_count);
 		io->inode = inode;
 		INIT_LIST_HEAD(&io->list);
-		atomic_set(&io->count, 1);
 	}
 	return io;
 }
 
-void ext4_put_io_end_defer(ext4_io_end_t *io_end)
-{
-	if (atomic_dec_and_test(&io_end->count)) {
-		if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
-			ext4_release_io_end(io_end);
-			return;
-		}
-		ext4_add_complete_io(io_end);
-	}
-}
-
-int ext4_put_io_end(ext4_io_end_t *io_end)
-{
-	int err = 0;
-
-	if (atomic_dec_and_test(&io_end->count)) {
-		if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
-			err = ext4_convert_unwritten_extents(io_end->inode,
-						io_end->offset, io_end->size);
-			ext4_clear_io_unwritten_flag(io_end);
-		}
-		ext4_release_io_end(io_end);
-	}
-	return err;
-}
-
-ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
-{
-	atomic_inc(&io_end->count);
-	return io_end;
-}
-
 /*
  * Print an buffer I/O error compatible with the fs/buffer.c.  This
  * provides compatibility with dmesg scrapers that look for a specific
@@ -324,7 +285,12 @@  static void ext4_end_bio(struct bio *bio, int error)
 			     bi_sector >> (inode->i_blkbits - 9));
 	}
 
-	ext4_put_io_end_defer(io_end);
+	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+		ext4_free_io_end(io_end);
+		return;
+	}
+
+	ext4_add_complete_io(io_end);
 }
 
 void ext4_io_submit(struct ext4_io_submit *io)
@@ -338,37 +304,40 @@  void ext4_io_submit(struct ext4_io_submit *io)
 		bio_put(io->io_bio);
 	}
 	io->io_bio = NULL;
-}
-
-void ext4_io_submit_init(struct ext4_io_submit *io,
-			 struct writeback_control *wbc)
-{
-	io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?  WRITE_SYNC : WRITE);
-	io->io_bio = NULL;
+	io->io_op = 0;
 	io->io_end = NULL;
 }
 
-static int io_submit_init_bio(struct ext4_io_submit *io,
-			      struct buffer_head *bh)
+static int io_submit_init(struct ext4_io_submit *io,
+			  struct inode *inode,
+			  struct writeback_control *wbc,
+			  struct buffer_head *bh)
 {
+	ext4_io_end_t *io_end;
+	struct page *page = bh->b_page;
 	int nvecs = bio_get_nr_vecs(bh->b_bdev);
 	struct bio *bio;
 
+	io_end = ext4_init_io_end(inode, GFP_NOFS);
+	if (!io_end)
+		return -ENOMEM;
 	bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
 	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 	bio->bi_bdev = bh->b_bdev;
+	bio->bi_private = io->io_end = io_end;
 	bio->bi_end_io = ext4_end_bio;
-	bio->bi_private = ext4_get_io_end(io->io_end);
-	if (!io->io_end->size)
-		io->io_end->offset = (bh->b_page->index << PAGE_CACHE_SHIFT)
-				     + bh_offset(bh);
+
+	io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
+
 	io->io_bio = bio;
+	io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?  WRITE_SYNC : WRITE);
 	io->io_next_block = bh->b_blocknr;
 	return 0;
 }
 
 static int io_submit_add_bh(struct ext4_io_submit *io,
 			    struct inode *inode,
+			    struct writeback_control *wbc,
 			    struct buffer_head *bh)
 {
 	ext4_io_end_t *io_end;
@@ -379,18 +348,18 @@  submit_and_retry:
 		ext4_io_submit(io);
 	}
 	if (io->io_bio == NULL) {
-		ret = io_submit_init_bio(io, bh);
+		ret = io_submit_init(io, inode, wbc, bh);
 		if (ret)
 			return ret;
 	}
-	ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
-	if (ret != bh->b_size)
-		goto submit_and_retry;
 	io_end = io->io_end;
 	if (test_clear_buffer_uninit(bh))
 		ext4_set_io_unwritten_flag(inode, io_end);
-	io_end->size += bh->b_size;
+	io->io_end->size += bh->b_size;
 	io->io_next_block++;
+	ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
+	if (ret != bh->b_size)
+		goto submit_and_retry;
 	return 0;
 }
 
@@ -462,7 +431,7 @@  int ext4_bio_write_page(struct ext4_io_submit *io,
 	do {
 		if (!buffer_async_write(bh))
 			continue;
-		ret = io_submit_add_bh(io, inode, bh);
+		ret = io_submit_add_bh(io, inode, wbc, bh);
 		if (ret) {
 			/*
 			 * We only get here on ENOMEM.  Not much else