diff mbox

[22/29] ext4: Defer clearing of PageWriteback after extent conversion

Message ID 1365456754-29373-23-git-send-email-jack@suse.cz
State Superseded, archived
Headers show

Commit Message

Jan Kara April 8, 2013, 9:32 p.m. UTC
Currently PageWriteback bit gets cleared from put_io_page() called from
ext4_end_bio(). This is somewhat inconvenient as extent tree is not
fully updated at that time (unwritten extents are not marked as written)
so we cannot read the data back yet. This design was dictated by lock
ordering as we cannot start a transaction while PageWriteback bit is set
(we could easily deadlock with ext4_da_writepages()). But now that we
use transaction reservation for extent conversion, locking issues are
solved and we can move PageWriteback bit clearing after extent
conversion is done. As a result we can remove wait for unwritt en extent
conversion from ext4_sync_file() because it already implicitely happe ns
through wait_on_page_writeback().

We implement deferring of PageWriteback clearing by queueing completed
bios to appropriate io_end and processing all the pages when io_end is
going to be freed instead of at the moment ext4_io_end() is called.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/ext4.h    |    2 +
 fs/ext4/fsync.c   |    4 --
 fs/ext4/page-io.c |  132 +++++++++++++++++++++++++++++------------------------
 3 files changed, 74 insertions(+), 64 deletions(-)

Comments

Zheng Liu May 8, 2013, 7:08 a.m. UTC | #1
On Mon, Apr 08, 2013 at 11:32:27PM +0200, Jan Kara wrote:
> Currently PageWriteback bit gets cleared from put_io_page() called from
> ext4_end_bio(). This is somewhat inconvenient as extent tree is not
> fully updated at that time (unwritten extents are not marked as written)
> so we cannot read the data back yet. This design was dictated by lock
> ordering as we cannot start a transaction while PageWriteback bit is set
> (we could easily deadlock with ext4_da_writepages()). But now that we
> use transaction reservation for extent conversion, locking issues are
> solved and we can move PageWriteback bit clearing after extent
> conversion is done. As a result we can remove wait for unwritt en extent
> conversion from ext4_sync_file() because it already implicitely happe ns
> through wait_on_page_writeback().
> 
> We implement deferring of PageWriteback clearing by queueing completed
> bios to appropriate io_end and processing all the pages when io_end is
> going to be freed instead of at the moment ext4_io_end() is called.
> 
> Signed-off-by: Jan Kara <jack@suse.cz>

Reviewed-by: Zheng Liu <wenqing.lz@taobao.com>
Regards,
                                                - Zheng

> ---
>  fs/ext4/ext4.h    |    2 +
>  fs/ext4/fsync.c   |    4 --
>  fs/ext4/page-io.c |  132 +++++++++++++++++++++++++++++------------------------
>  3 files changed, 74 insertions(+), 64 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index a594a94..2b0dd9a 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -190,6 +190,8 @@ typedef struct ext4_io_end {
>  	handle_t		*handle;	/* handle reserved for extent
>  						 * conversion */
>  	struct inode		*inode;		/* file being written to */
> +	struct bio		*bio;		/* Linked list of completed
> +						 * bios covering the extent */
>  	unsigned int		flag;		/* unwritten or not */
>  	loff_t			offset;		/* offset in the file */
>  	ssize_t			size;		/* size of the extent */
> diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
> index 3278e64..e02ba1b 100644
> --- a/fs/ext4/fsync.c
> +++ b/fs/ext4/fsync.c
> @@ -132,10 +132,6 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
>  	if (inode->i_sb->s_flags & MS_RDONLY)
>  		goto out;
>  
> -	ret = ext4_flush_unwritten_io(inode);
> -	if (ret < 0)
> -		goto out;
> -
>  	if (!journal) {
>  		ret = __sync_inode(inode, datasync);
>  		if (!ret && !hlist_empty(&inode->i_dentry))
> diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
> index 8bff3b3..2967794 100644
> --- a/fs/ext4/page-io.c
> +++ b/fs/ext4/page-io.c
> @@ -51,14 +51,83 @@ void ext4_ioend_wait(struct inode *inode)
>  	wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
>  }
>  
> +/*
> + * Print an buffer I/O error compatible with the fs/buffer.c.  This
> + * provides compatibility with dmesg scrapers that look for a specific
> + * buffer I/O error message.  We really need a unified error reporting
> + * structure to userspace ala Digital Unix's uerf system, but it's
> + * probably not going to happen in my lifetime, due to LKML politics...
> + */
> +static void buffer_io_error(struct buffer_head *bh)
> +{
> +	char b[BDEVNAME_SIZE];
> +	printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
> +			bdevname(bh->b_bdev, b),
> +			(unsigned long long)bh->b_blocknr);
> +}
> +
> +static void ext4_finish_bio(struct bio *bio)
> +{
> +	int i;
> +	int error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
> +
> +	for (i = 0; i < bio->bi_vcnt; i++) {
> +		struct bio_vec *bvec = &bio->bi_io_vec[i];
> +		struct page *page = bvec->bv_page;
> +		struct buffer_head *bh, *head;
> +		unsigned bio_start = bvec->bv_offset;
> +		unsigned bio_end = bio_start + bvec->bv_len;
> +		unsigned under_io = 0;
> +		unsigned long flags;
> +
> +		if (!page)
> +			continue;
> +
> +		if (error) {
> +			SetPageError(page);
> +			set_bit(AS_EIO, &page->mapping->flags);
> +		}
> +		bh = head = page_buffers(page);
> +		/*
> +		 * We check all buffers in the page under BH_Uptodate_Lock
> +		 * to avoid races with other end io clearing async_write flags
> +		 */
> +		local_irq_save(flags);
> +		bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
> +		do {
> +			if (bh_offset(bh) < bio_start ||
> +			    bh_offset(bh) + bh->b_size > bio_end) {
> +				if (buffer_async_write(bh))
> +					under_io++;
> +				continue;
> +			}
> +			clear_buffer_async_write(bh);
> +			if (error)
> +				buffer_io_error(bh);
> +		} while ((bh = bh->b_this_page) != head);
> +		bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
> +		local_irq_restore(flags);
> +		if (!under_io)
> +			end_page_writeback(page);
> +	}
> +}
> +
>  static void ext4_release_io_end(ext4_io_end_t *io_end)
>  {
> +	struct bio *bio, *next_bio;
> +
>  	BUG_ON(!list_empty(&io_end->list));
>  	BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
>  	WARN_ON(io_end->handle);
>  
>  	if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
>  		wake_up_all(ext4_ioend_wq(io_end->inode));
> +
> +	for (bio = io_end->bio; bio; bio = next_bio) {
> +		next_bio = bio->bi_private;
> +		ext4_finish_bio(bio);
> +		bio_put(bio);
> +	}
>  	if (io_end->flag & EXT4_IO_END_DIRECT)
>  		inode_dio_done(io_end->inode);
>  	if (io_end->iocb)
> @@ -254,76 +323,20 @@ ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
>  	return io_end;
>  }
>  
> -/*
> - * Print an buffer I/O error compatible with the fs/buffer.c.  This
> - * provides compatibility with dmesg scrapers that look for a specific
> - * buffer I/O error message.  We really need a unified error reporting
> - * structure to userspace ala Digital Unix's uerf system, but it's
> - * probably not going to happen in my lifetime, due to LKML politics...
> - */
> -static void buffer_io_error(struct buffer_head *bh)
> -{
> -	char b[BDEVNAME_SIZE];
> -	printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
> -			bdevname(bh->b_bdev, b),
> -			(unsigned long long)bh->b_blocknr);
> -}
> -
>  static void ext4_end_bio(struct bio *bio, int error)
>  {
>  	ext4_io_end_t *io_end = bio->bi_private;
>  	struct inode *inode;
> -	int i;
> -	int blocksize;
>  	sector_t bi_sector = bio->bi_sector;
>  
>  	BUG_ON(!io_end);
>  	inode = io_end->inode;
> -	blocksize = 1 << inode->i_blkbits;
> -	bio->bi_private = NULL;
>  	bio->bi_end_io = NULL;
> +	/* Link bio into list hanging from io_end */
> +	bio->bi_private = io_end->bio;
> +	io_end->bio = bio;
>  	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
>  		error = 0;
> -	for (i = 0; i < bio->bi_vcnt; i++) {
> -		struct bio_vec *bvec = &bio->bi_io_vec[i];
> -		struct page *page = bvec->bv_page;
> -		struct buffer_head *bh, *head;
> -		unsigned bio_start = bvec->bv_offset;
> -		unsigned bio_end = bio_start + bvec->bv_len;
> -		unsigned under_io = 0;
> -		unsigned long flags;
> -
> -		if (!page)
> -			continue;
> -
> -		if (error) {
> -			SetPageError(page);
> -			set_bit(AS_EIO, &page->mapping->flags);
> -		}
> -		bh = head = page_buffers(page);
> -		/*
> -		 * We check all buffers in the page under BH_Uptodate_Lock
> -		 * to avoid races with other end io clearing async_write flags
> -		 */
> -		local_irq_save(flags);
> -		bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
> -		do {
> -			if (bh_offset(bh) < bio_start ||
> -			    bh_offset(bh) + blocksize > bio_end) {
> -				if (buffer_async_write(bh))
> -					under_io++;
> -				continue;
> -			}
> -			clear_buffer_async_write(bh);
> -			if (error)
> -				buffer_io_error(bh);
> -		} while ((bh = bh->b_this_page) != head);
> -		bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
> -		local_irq_restore(flags);
> -		if (!under_io)
> -			end_page_writeback(page);
> -	}
> -	bio_put(bio);
>  
>  	if (error) {
>  		io_end->flag |= EXT4_IO_END_ERROR;
> @@ -335,7 +348,6 @@ static void ext4_end_bio(struct bio *bio, int error)
>  			     (unsigned long long)
>  			     bi_sector >> (inode->i_blkbits - 9));
>  	}
> -
>  	ext4_put_io_end_defer(io_end);
>  }
>  
> -- 
> 1.7.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a594a94..2b0dd9a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -190,6 +190,8 @@  typedef struct ext4_io_end {
 	handle_t		*handle;	/* handle reserved for extent
 						 * conversion */
 	struct inode		*inode;		/* file being written to */
+	struct bio		*bio;		/* Linked list of completed
+						 * bios covering the extent */
 	unsigned int		flag;		/* unwritten or not */
 	loff_t			offset;		/* offset in the file */
 	ssize_t			size;		/* size of the extent */
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 3278e64..e02ba1b 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -132,10 +132,6 @@  int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	if (inode->i_sb->s_flags & MS_RDONLY)
 		goto out;
 
-	ret = ext4_flush_unwritten_io(inode);
-	if (ret < 0)
-		goto out;
-
 	if (!journal) {
 		ret = __sync_inode(inode, datasync);
 		if (!ret && !hlist_empty(&inode->i_dentry))
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 8bff3b3..2967794 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -51,14 +51,83 @@  void ext4_ioend_wait(struct inode *inode)
 	wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
 }
 
+/*
+ * Print an buffer I/O error compatible with the fs/buffer.c.  This
+ * provides compatibility with dmesg scrapers that look for a specific
+ * buffer I/O error message.  We really need a unified error reporting
+ * structure to userspace ala Digital Unix's uerf system, but it's
+ * probably not going to happen in my lifetime, due to LKML politics...
+ */
+static void buffer_io_error(struct buffer_head *bh)
+{
+	char b[BDEVNAME_SIZE];
+	printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
+			bdevname(bh->b_bdev, b),
+			(unsigned long long)bh->b_blocknr);
+}
+
+static void ext4_finish_bio(struct bio *bio)
+{
+	int i;
+	int error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
+
+	for (i = 0; i < bio->bi_vcnt; i++) {
+		struct bio_vec *bvec = &bio->bi_io_vec[i];
+		struct page *page = bvec->bv_page;
+		struct buffer_head *bh, *head;
+		unsigned bio_start = bvec->bv_offset;
+		unsigned bio_end = bio_start + bvec->bv_len;
+		unsigned under_io = 0;
+		unsigned long flags;
+
+		if (!page)
+			continue;
+
+		if (error) {
+			SetPageError(page);
+			set_bit(AS_EIO, &page->mapping->flags);
+		}
+		bh = head = page_buffers(page);
+		/*
+		 * We check all buffers in the page under BH_Uptodate_Lock
+		 * to avoid races with other end io clearing async_write flags
+		 */
+		local_irq_save(flags);
+		bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
+		do {
+			if (bh_offset(bh) < bio_start ||
+			    bh_offset(bh) + bh->b_size > bio_end) {
+				if (buffer_async_write(bh))
+					under_io++;
+				continue;
+			}
+			clear_buffer_async_write(bh);
+			if (error)
+				buffer_io_error(bh);
+		} while ((bh = bh->b_this_page) != head);
+		bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
+		local_irq_restore(flags);
+		if (!under_io)
+			end_page_writeback(page);
+	}
+}
+
 static void ext4_release_io_end(ext4_io_end_t *io_end)
 {
+	struct bio *bio, *next_bio;
+
 	BUG_ON(!list_empty(&io_end->list));
 	BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
 	WARN_ON(io_end->handle);
 
 	if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
 		wake_up_all(ext4_ioend_wq(io_end->inode));
+
+	for (bio = io_end->bio; bio; bio = next_bio) {
+		next_bio = bio->bi_private;
+		ext4_finish_bio(bio);
+		bio_put(bio);
+	}
 	if (io_end->flag & EXT4_IO_END_DIRECT)
 		inode_dio_done(io_end->inode);
 	if (io_end->iocb)
@@ -254,76 +323,20 @@  ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
 	return io_end;
 }
 
-/*
- * Print an buffer I/O error compatible with the fs/buffer.c.  This
- * provides compatibility with dmesg scrapers that look for a specific
- * buffer I/O error message.  We really need a unified error reporting
- * structure to userspace ala Digital Unix's uerf system, but it's
- * probably not going to happen in my lifetime, due to LKML politics...
- */
-static void buffer_io_error(struct buffer_head *bh)
-{
-	char b[BDEVNAME_SIZE];
-	printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
-			bdevname(bh->b_bdev, b),
-			(unsigned long long)bh->b_blocknr);
-}
-
 static void ext4_end_bio(struct bio *bio, int error)
 {
 	ext4_io_end_t *io_end = bio->bi_private;
 	struct inode *inode;
-	int i;
-	int blocksize;
 	sector_t bi_sector = bio->bi_sector;
 
 	BUG_ON(!io_end);
 	inode = io_end->inode;
-	blocksize = 1 << inode->i_blkbits;
-	bio->bi_private = NULL;
 	bio->bi_end_io = NULL;
+	/* Link bio into list hanging from io_end */
+	bio->bi_private = io_end->bio;
+	io_end->bio = bio;
 	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
 		error = 0;
-	for (i = 0; i < bio->bi_vcnt; i++) {
-		struct bio_vec *bvec = &bio->bi_io_vec[i];
-		struct page *page = bvec->bv_page;
-		struct buffer_head *bh, *head;
-		unsigned bio_start = bvec->bv_offset;
-		unsigned bio_end = bio_start + bvec->bv_len;
-		unsigned under_io = 0;
-		unsigned long flags;
-
-		if (!page)
-			continue;
-
-		if (error) {
-			SetPageError(page);
-			set_bit(AS_EIO, &page->mapping->flags);
-		}
-		bh = head = page_buffers(page);
-		/*
-		 * We check all buffers in the page under BH_Uptodate_Lock
-		 * to avoid races with other end io clearing async_write flags
-		 */
-		local_irq_save(flags);
-		bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
-		do {
-			if (bh_offset(bh) < bio_start ||
-			    bh_offset(bh) + blocksize > bio_end) {
-				if (buffer_async_write(bh))
-					under_io++;
-				continue;
-			}
-			clear_buffer_async_write(bh);
-			if (error)
-				buffer_io_error(bh);
-		} while ((bh = bh->b_this_page) != head);
-		bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
-		local_irq_restore(flags);
-		if (!under_io)
-			end_page_writeback(page);
-	}
-	bio_put(bio);
 
 	if (error) {
 		io_end->flag |= EXT4_IO_END_ERROR;
@@ -335,7 +348,6 @@  static void ext4_end_bio(struct bio *bio, int error)
 			     (unsigned long long)
 			     bi_sector >> (inode->i_blkbits - 9));
 	}
-
 	ext4_put_io_end_defer(io_end);
 }