Patchwork [09/10,v5] ext4: convert unwritten extents from extent status tree in end_io

login
register
mail settings
Submitter Zheng Liu
Date Feb. 8, 2013, 8:44 a.m.
Message ID <1360313046-9876-10-git-send-email-wenqing.lz@taobao.com>
Download mbox | patch
Permalink /patch/219065/
State Superseded
Headers show

Comments

Zheng Liu - Feb. 8, 2013, 8:44 a.m.
From: Zheng Liu <wenqing.lz@taobao.com>

This commit tries to convert unwritten extents from extent status tree
in end_io callback functions and ext4_ext_direct_IO.

Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Jan kara <jack@suse.cz>
---
 fs/ext4/extents.c           |   6 +-
 fs/ext4/extents_status.c    | 180 ++++++++++++++++++++++++++++++++++++++++----
 fs/ext4/extents_status.h    |   2 +
 fs/ext4/inode.c             |   5 ++
 fs/ext4/page-io.c           |   8 +-
 include/trace/events/ext4.h |  25 ++++++
 6 files changed, 208 insertions(+), 18 deletions(-)
Zheng Liu - Feb. 10, 2013, 8:45 a.m.
On Fri, Feb 08, 2013 at 04:44:05PM +0800, Zheng Liu wrote:
> From: Zheng Liu <wenqing.lz@taobao.com>
> 
> This commit tries to convert unwritten extents from extent status tree
> in end_io callback functions and ext4_ext_direct_IO.
> 
> Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
> Cc: "Theodore Ts'o" <tytso@mit.edu>
> Cc: Jan kara <jack@suse.cz>
> ---
>  fs/ext4/extents.c           |   6 +-
>  fs/ext4/extents_status.c    | 180 ++++++++++++++++++++++++++++++++++++++++----
>  fs/ext4/extents_status.h    |   2 +
>  fs/ext4/inode.c             |   5 ++
>  fs/ext4/page-io.c           |   8 +-
>  include/trace/events/ext4.h |  25 ++++++
>  6 files changed, 208 insertions(+), 18 deletions(-)
> 
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 9f21430..a03cabf 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -4443,8 +4443,10 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
>  			ret = PTR_ERR(handle);
>  			break;
>  		}
> -		ret = ext4_map_blocks(handle, inode, &map,
> -				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
> +		down_write(&EXT4_I(inode)->i_data_sem);
> +		ret = ext4_ext_map_blocks(handle, inode, &map,
> +					  EXT4_GET_BLOCKS_IO_CONVERT_EXT);
> +		up_write(&EXT4_I(inode)->i_data_sem);
>  		if (ret <= 0) {
>  			WARN_ON(ret <= 0);
>  			ext4_msg(inode->i_sb, KERN_ERR,

Hi Ted,

This chunk is missing in your 'dev' branch of ext4.  If we call
ext4_map_blocks here, unwritten extent in disk will never be converted
because ext4_map_blocks first tries to lookup extent status tree and the
unwriten extent in this tree has been converted in end_io callback
function.  It always looks up a written extent in cache, and return
immdiately.  Please check it again.

Thanks,
                                                - Zheng

> diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
> index bac5286..eab8893 100644
> --- a/fs/ext4/extents_status.c
> +++ b/fs/ext4/extents_status.c
> @@ -248,10 +248,11 @@ ext4_lblk_t ext4_es_find_delayed_extent(struct inode *inode,
>  	struct extent_status *es1 = NULL;
>  	struct rb_node *node;
>  	ext4_lblk_t ret = EXT_MAX_BLOCKS;
> +	unsigned long flags;
>  
>  	trace_ext4_es_find_delayed_extent_enter(inode, es->es_lblk);
>  
> -	read_lock(&EXT4_I(inode)->i_es_lock);
> +	read_lock_irqsave(&EXT4_I(inode)->i_es_lock, flags);
>  	tree = &EXT4_I(inode)->i_es_tree;
>  
>  	/* find extent in cache firstly */
> @@ -291,7 +292,7 @@ out:
>  		}
>  	}
>  
> -	read_unlock(&EXT4_I(inode)->i_es_lock);
> +	read_unlock_irqrestore(&EXT4_I(inode)->i_es_lock, flags);
>  
>  	ext4_es_lru_add(inode);
>  	trace_ext4_es_find_delayed_extent_exit(inode, es, ret);
> @@ -458,6 +459,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
>  {
>  	struct extent_status newes;
>  	ext4_lblk_t end = lblk + len - 1;
> +	unsigned long flags;
>  	int err = 0;
>  
>  	es_debug("add [%u/%u) %llu %llx to extent status tree of inode %lu\n",
> @@ -471,14 +473,14 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
>  	ext4_es_store_status(&newes, status);
>  	trace_ext4_es_insert_extent(inode, &newes);
>  
> -	write_lock(&EXT4_I(inode)->i_es_lock);
> +	write_lock_irqsave(&EXT4_I(inode)->i_es_lock, flags);
>  	err = __es_remove_extent(inode, lblk, end);
>  	if (err != 0)
>  		goto error;
>  	err = __es_insert_extent(inode, &newes);
>  
>  error:
> -	write_unlock(&EXT4_I(inode)->i_es_lock);
> +	write_unlock_irqrestore(&EXT4_I(inode)->i_es_lock, flags);
>  
>  	ext4_es_lru_add(inode);
>  	ext4_es_print_tree(inode);
> @@ -498,13 +500,14 @@ int ext4_es_lookup_extent(struct inode *inode, struct extent_status *es)
>  	struct ext4_es_tree *tree;
>  	struct extent_status *es1 = NULL;
>  	struct rb_node *node;
> +	unsigned long flags;
>  	int found = 0;
>  
>  	trace_ext4_es_lookup_extent_enter(inode, es->es_lblk);
>  	es_debug("lookup extent in block %u\n", es->es_lblk);
>  
>  	tree = &EXT4_I(inode)->i_es_tree;
> -	read_lock(&EXT4_I(inode)->i_es_lock);
> +	read_lock_irqsave(&EXT4_I(inode)->i_es_lock, flags);
>  
>  	/* find extent in cache firstly */
>  	es->es_len = es->es_pblk = 0;
> @@ -539,7 +542,7 @@ out:
>  		es->es_pblk = es1->es_pblk;
>  	}
>  
> -	read_unlock(&EXT4_I(inode)->i_es_lock);
> +	read_unlock_irqrestore(&EXT4_I(inode)->i_es_lock, flags);
>  
>  	ext4_es_lru_add(inode);
>  	trace_ext4_es_lookup_extent_exit(inode, es, found);
> @@ -649,6 +652,7 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
>  			  ext4_lblk_t len)
>  {
>  	ext4_lblk_t end;
> +	unsigned long flags;
>  	int err = 0;
>  
>  	trace_ext4_es_remove_extent(inode, lblk, len);
> @@ -658,9 +662,9 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
>  	end = lblk + len - 1;
>  	BUG_ON(end < lblk);
>  
> -	write_lock(&EXT4_I(inode)->i_es_lock);
> +	write_lock_irqsave(&EXT4_I(inode)->i_es_lock, flags);
>  	err = __es_remove_extent(inode, lblk, end);
> -	write_unlock(&EXT4_I(inode)->i_es_lock);
> +	write_unlock_irqrestore(&EXT4_I(inode)->i_es_lock, flags);
>  	ext4_es_print_tree(inode);
>  	return err;
>  }
> @@ -671,6 +675,7 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
>  					struct ext4_sb_info, s_es_shrinker);
>  	struct ext4_inode_info *ei;
>  	struct list_head *cur, *tmp, scanned;
> +	unsigned long flags;
>  	int nr_to_scan = sc->nr_to_scan;
>  	int ret, nr_shrunk = 0;
>  
> @@ -687,16 +692,16 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
>  
>  		ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
>  
> -		read_lock(&ei->i_es_lock);
> +		read_lock_irqsave(&ei->i_es_lock, flags);
>  		if (ei->i_es_lru_nr == 0) {
> -			read_unlock(&ei->i_es_lock);
> +			read_unlock_irqrestore(&ei->i_es_lock, flags);
>  			continue;
>  		}
> -		read_unlock(&ei->i_es_lock);
> +		read_unlock_irqrestore(&ei->i_es_lock, flags);
>  
> -		write_lock(&ei->i_es_lock);
> +		write_lock_irqsave(&ei->i_es_lock, flags);
>  		ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
> -		write_unlock(&ei->i_es_lock);
> +		write_unlock_irqrestore(&ei->i_es_lock, flags);
>  
>  		nr_shrunk += ret;
>  		nr_to_scan -= ret;
> @@ -756,14 +761,15 @@ static int ext4_es_reclaim_extents_count(struct super_block *sb)
>  	struct ext4_sb_info *sbi = EXT4_SB(sb);
>  	struct ext4_inode_info *ei;
>  	struct list_head *cur;
> +	unsigned long flags;
>  	int nr_cached = 0;
>  
>  	spin_lock(&sbi->s_es_lru_lock);
>  	list_for_each(cur, &sbi->s_es_lru) {
>  		ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
> -		read_lock(&ei->i_es_lock);
> +		read_lock_irqsave(&ei->i_es_lock, flags);
>  		nr_cached += ei->i_es_lru_nr;
> -		read_unlock(&ei->i_es_lock);
> +		read_unlock_irqrestore(&ei->i_es_lock, flags);
>  	}
>  	spin_unlock(&sbi->s_es_lru_lock);
>  	trace_ext4_es_reclaim_extents_count(sb, nr_cached);
> @@ -801,3 +807,147 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
>  	tree->cache_es = NULL;
>  	return nr_shrunk;
>  }
> +
> +int ext4_es_convert_unwritten_extents(struct inode *inode, loff_t offset,
> +				      size_t size)
> +{
> +	struct ext4_es_tree *tree;
> +	struct rb_node *node;
> +	struct extent_status *es, orig_es, conv_es;
> +	ext4_lblk_t end, len1, len2;
> +	ext4_lblk_t lblk = 0, len = 0;
> +	ext4_fsblk_t block;
> +	unsigned long flags;
> +	unsigned int blkbits;
> +	int err = 0;
> +
> +	trace_ext4_es_convert_unwritten_extents(inode, offset, size);
> +	blkbits = inode->i_blkbits;
> +	lblk = offset >> blkbits;
> +	len = (EXT4_BLOCK_ALIGN(offset + size, blkbits) >> blkbits) - lblk;
> +
> +	end = lblk + len - 1;
> +	BUG_ON(end < lblk);
> +
> +	tree = &EXT4_I(inode)->i_es_tree;
> +
> +	write_lock_irqsave(&EXT4_I(inode)->i_es_lock, flags);
> +	es = __es_tree_search(&tree->root, lblk);
> +	if (!es)
> +		goto out;
> +	if (es->es_lblk > end)
> +		goto out;
> +
> +	tree->cache_es = NULL;
> +
> +	orig_es.es_lblk = es->es_lblk;
> +	orig_es.es_len = es->es_len;
> +	orig_es.es_pblk = es->es_pblk;
> +
> +	len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
> +	len2 = ext4_es_end(es) > end ?
> +	       ext4_es_end(es) - end : 0;
> +	if (len1 > 0)
> +		es->es_len = len1;
> +	if (len2 > 0) {
> +		if (len1 > 0) {
> +			struct extent_status newes;
> +
> +			newes.es_lblk = end + 1;
> +			newes.es_len = len2;
> +			block = ext4_es_pblock(&orig_es) +
> +				orig_es.es_len - len2;
> +			ext4_es_store_pblock(&newes, block);
> +			ext4_es_store_status(&newes, ext4_es_status(&orig_es));
> +			err = __es_insert_extent(inode, &newes);
> +			if (err) {
> +				es->es_lblk = orig_es.es_lblk;
> +				es->es_len = orig_es.es_len;
> +				es->es_pblk = orig_es.es_pblk;
> +				goto out;
> +			}
> +
> +			conv_es.es_lblk = orig_es.es_lblk + len1;
> +			conv_es.es_len = orig_es.es_len - len1 - len2;
> +			block = ext4_es_pblock(&orig_es) + len1;
> +			ext4_es_store_pblock(&conv_es, block);
> +			ext4_es_store_status(&conv_es, EXTENT_STATUS_WRITTEN);
> +			err = __es_insert_extent(inode, &conv_es);
> +			if (err) {
> +				int err2 = __es_remove_extent(inode,
> +							conv_es.es_lblk,
> +							ext4_es_end(&newes));
> +				if (err2)
> +					goto out;
> +				es->es_lblk = orig_es.es_lblk;
> +				es->es_len = orig_es.es_len;
> +				es->es_pblk = orig_es.es_pblk;
> +				goto out;
> +			}
> +		} else {
> +			es->es_lblk = end + 1;
> +			es->es_len = len2;
> +			block = ext4_es_pblock(&orig_es) +
> +				orig_es.es_len - len2;
> +			ext4_es_store_pblock(es, block);
> +
> +			conv_es.es_lblk = orig_es.es_lblk;
> +			conv_es.es_len = orig_es.es_len - len2;
> +			ext4_es_store_pblock(&conv_es,
> +					     ext4_es_pblock(&orig_es));
> +			ext4_es_store_status(&conv_es, EXTENT_STATUS_WRITTEN);
> +			err = __es_insert_extent(inode, &conv_es);
> +			if (err) {
> +				es->es_lblk = orig_es.es_lblk;
> +				es->es_len = orig_es.es_len;
> +				es->es_pblk = orig_es.es_pblk;
> +			}
> +		}
> +		goto out;
> +	}
> +
> +	if (len1 > 0) {
> +		node = rb_next(&es->rb_node);
> +		if (node)
> +			es = rb_entry(node, struct extent_status, rb_node);
> +		else
> +			es = NULL;
> +	}
> +
> +	while (es && ext4_es_end(es) <= end) {
> +		node = rb_next(&es->rb_node);
> +		ext4_es_store_status(es, EXTENT_STATUS_WRITTEN);
> +		if (!inode) {
> +			es = NULL;
> +			break;
> +		}
> +		es = rb_entry(node, struct extent_status, rb_node);
> +	}
> +
> +	if (es && es->es_lblk < end + 1) {
> +		ext4_lblk_t orig_len = es->es_len;
> +
> +		/*
> +		 * Here we first set conv_es just because of avoiding copy the
> +		 * value of es to a temporary variable.
> +		 */
> +		len1 = ext4_es_end(es) - end;
> +		conv_es.es_lblk = es->es_lblk;
> +		conv_es.es_len = es->es_len - len1;
> +		ext4_es_store_pblock(&conv_es, ext4_es_pblock(es));
> +		ext4_es_store_status(&conv_es, EXTENT_STATUS_WRITTEN);
> +
> +		es->es_lblk = end + 1;
> +		es->es_len = len1;
> +		block = ext4_es_pblock(es) + orig_len - len1;
> +		ext4_es_store_pblock(es, block);
> +
> +		err = __es_insert_extent(inode, &conv_es);
> +		if (err)
> +			goto out;
> +	}
> +
> +out:
> +	write_unlock_irqrestore(&EXT4_I(inode)->i_es_lock, flags);
> +	return err;
> +}
> diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
> index 938ad2b..2849d74 100644
> --- a/fs/ext4/extents_status.h
> +++ b/fs/ext4/extents_status.h
> @@ -54,6 +54,8 @@ extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
>  extern ext4_lblk_t ext4_es_find_delayed_extent(struct inode *inode,
>  					       struct extent_status *es);
>  extern int ext4_es_lookup_extent(struct inode *inode, struct extent_status *es);
> +extern int ext4_es_convert_unwritten_extents(struct inode *inode, loff_t offset,
> +					     size_t size);
>  
>  static inline int ext4_es_is_written(struct extent_status *es)
>  {
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 670779a..08cf720 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -3063,6 +3063,7 @@ out:
>  		io_end->result = ret;
>  	}
>  
> +	ext4_es_convert_unwritten_extents(inode, offset, size);
>  	ext4_add_complete_io(io_end);
>  }
>  
> @@ -3088,6 +3089,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
>  	 */
>  	inode = io_end->inode;
>  	ext4_set_io_unwritten_flag(inode, io_end);
> +	ext4_es_convert_unwritten_extents(inode, io_end->offset, io_end->size);
>  	ext4_add_complete_io(io_end);
>  out:
>  	bh->b_private = NULL;
> @@ -3246,6 +3248,9 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
>  	} else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
>  						EXT4_STATE_DIO_UNWRITTEN)) {
>  		int err;
> +		err = ext4_es_convert_unwritten_extents(inode, offset, ret);
> +		if (err)
> +			ret = err;
>  		/*
>  		 * for non AIO case, since the IO is already
>  		 * completed, we could do the conversion right here
> diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
> index 0016fbc..66ea30e 100644
> --- a/fs/ext4/page-io.c
> +++ b/fs/ext4/page-io.c
> @@ -276,6 +276,13 @@ static void ext4_end_bio(struct bio *bio, int error)
>  		error = 0;
>  	bio_put(bio);
>  
> +	/*
> +	 * We need to convert unwrittne extents in extent status tree before
> +	 * end_page_writeback() is called.  Otherwise, when dioread_nolock is
> +	 * enabled, we will be likely to read stale data.
> +	 */
> +	inode = io_end->inode;
> +	ext4_es_convert_unwritten_extents(inode, io_end->offset, io_end->size);
>  	for (i = 0; i < io_end->num_io_pages; i++) {
>  		struct page *page = io_end->pages[i]->p_page;
>  		struct buffer_head *bh, *head;
> @@ -305,7 +312,6 @@ static void ext4_end_bio(struct bio *bio, int error)
>  		put_io_page(io_end->pages[i]);
>  	}
>  	io_end->num_io_pages = 0;
> -	inode = io_end->inode;
>  
>  	if (error) {
>  		io_end->flag |= EXT4_IO_END_ERROR;
> diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
> index f0734b3..d32e3d5 100644
> --- a/include/trace/events/ext4.h
> +++ b/include/trace/events/ext4.h
> @@ -2233,6 +2233,31 @@ TRACE_EVENT(ext4_es_lookup_extent_exit,
>  		  __entry->found ? __entry->status : 0)
>  );
>  
> +TRACE_EVENT(ext4_es_convert_unwritten_extents,
> +	TP_PROTO(struct inode *inode, loff_t offset, loff_t size),
> +
> +	TP_ARGS(inode, offset, size),
> +
> +	TP_STRUCT__entry(
> +		__field(	dev_t,	dev			)
> +		__field(	ino_t,	ino			)
> +		__field(	loff_t,	offset			)
> +		__field(	loff_t, size			)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->dev	= inode->i_sb->s_dev;
> +		__entry->ino	= inode->i_ino;
> +		__entry->offset	= offset;
> +		__entry->size	= size;
> +	),
> +
> +	TP_printk("dev %d,%d ino %lu convert unwritten extents [%llu/%llu",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  (unsigned long) __entry->ino,
> +		  __entry->offset, __entry->size)
> +);
> +
>  TRACE_EVENT(ext4_es_reclaim_extents_count,
>  	TP_PROTO(struct super_block *sb, int nr_cached),
>  
> -- 
> 1.7.12.rc2.18.g61b472e
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Theodore Ts'o - Feb. 11, 2013, 1:52 a.m.
On Sun, Feb 10, 2013 at 04:45:11PM +0800, Zheng Liu wrote:
> 
> This chunk is missing in your 'dev' branch of ext4.  If we call
> ext4_map_blocks here, unwritten extent in disk will never be converted
> because ext4_map_blocks first tries to lookup extent status tree and the
> unwriten extent in this tree has been converted in end_io callback
> function.  It always looks up a written extent in cache, and return
> immdiately.  Please check it again.

Thanks for catching this!  I missed the the patch hunk getting
rejected because I had been distracted with the patch hunk rejection
caused by the fact that we've since dropped the function
ext4_end_io_buffer_write().

I've fixed this up and pushed an updated git "dev" branch.

     	   	       	      	 	 - Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara - Feb. 12, 2013, 12:51 p.m.
On Fri 08-02-13 16:44:05, Zheng Liu wrote:
> From: Zheng Liu <wenqing.lz@taobao.com>
> 
> This commit tries to convert unwritten extents from extent status tree
> in end_io callback functions and ext4_ext_direct_IO.
  Why should we do this?

...
> @@ -801,3 +807,147 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
>  	tree->cache_es = NULL;
>  	return nr_shrunk;
>  }
> +
> +int ext4_es_convert_unwritten_extents(struct inode *inode, loff_t offset,
> +				      size_t size)
> +{
> +	struct ext4_es_tree *tree;
> +	struct rb_node *node;
> +	struct extent_status *es, orig_es, conv_es;
> +	ext4_lblk_t end, len1, len2;
> +	ext4_lblk_t lblk = 0, len = 0;
> +	ext4_fsblk_t block;
> +	unsigned long flags;
> +	unsigned int blkbits;
> +	int err = 0;
> +
> +	trace_ext4_es_convert_unwritten_extents(inode, offset, size);
> +	blkbits = inode->i_blkbits;
> +	lblk = offset >> blkbits;
> +	len = (EXT4_BLOCK_ALIGN(offset + size, blkbits) >> blkbits) - lblk;
> +
> +	end = lblk + len - 1;
> +	BUG_ON(end < lblk);
> +
> +	tree = &EXT4_I(inode)->i_es_tree;
> +
> +	write_lock_irqsave(&EXT4_I(inode)->i_es_lock, flags);
> +	es = __es_tree_search(&tree->root, lblk);
> +	if (!es)
> +		goto out;
> +	if (es->es_lblk > end)
> +		goto out;
> +
> +	tree->cache_es = NULL;
> +
> +	orig_es.es_lblk = es->es_lblk;
> +	orig_es.es_len = es->es_len;
> +	orig_es.es_pblk = es->es_pblk;
> +
> +	len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
> +	len2 = ext4_es_end(es) > end ?
> +	       ext4_es_end(es) - end : 0;
> +	if (len1 > 0)
> +		es->es_len = len1;
> +	if (len2 > 0) {
> +		if (len1 > 0) {
> +			struct extent_status newes;
> +
> +			newes.es_lblk = end + 1;
> +			newes.es_len = len2;
> +			block = ext4_es_pblock(&orig_es) +
> +				orig_es.es_len - len2;
> +			ext4_es_store_pblock(&newes, block);
> +			ext4_es_store_status(&newes, ext4_es_status(&orig_es));
> +			err = __es_insert_extent(inode, &newes);
> +			if (err) {
> +				es->es_lblk = orig_es.es_lblk;
> +				es->es_len = orig_es.es_len;
> +				es->es_pblk = orig_es.es_pblk;
> +				goto out;
> +			}
> +
> +			conv_es.es_lblk = orig_es.es_lblk + len1;
> +			conv_es.es_len = orig_es.es_len - len1 - len2;
> +			block = ext4_es_pblock(&orig_es) + len1;
> +			ext4_es_store_pblock(&conv_es, block);
> +			ext4_es_store_status(&conv_es, EXTENT_STATUS_WRITTEN);
> +			err = __es_insert_extent(inode, &conv_es);
> +			if (err) {
> +				int err2 = __es_remove_extent(inode,
> +							conv_es.es_lblk,
> +							ext4_es_end(&newes));
> +				if (err2)
> +					goto out;
> +				es->es_lblk = orig_es.es_lblk;
> +				es->es_len = orig_es.es_len;
> +				es->es_pblk = orig_es.es_pblk;
> +				goto out;
> +			}
> +		} else {
> +			es->es_lblk = end + 1;
> +			es->es_len = len2;
> +			block = ext4_es_pblock(&orig_es) +
> +				orig_es.es_len - len2;
> +			ext4_es_store_pblock(es, block);
> +
> +			conv_es.es_lblk = orig_es.es_lblk;
> +			conv_es.es_len = orig_es.es_len - len2;
> +			ext4_es_store_pblock(&conv_es,
> +					     ext4_es_pblock(&orig_es));
> +			ext4_es_store_status(&conv_es, EXTENT_STATUS_WRITTEN);
> +			err = __es_insert_extent(inode, &conv_es);
> +			if (err) {
> +				es->es_lblk = orig_es.es_lblk;
> +				es->es_len = orig_es.es_len;
> +				es->es_pblk = orig_es.es_pblk;
> +			}
> +		}
> +		goto out;
> +	}
> +
> +	if (len1 > 0) {
> +		node = rb_next(&es->rb_node);
> +		if (node)
> +			es = rb_entry(node, struct extent_status, rb_node);
> +		else
> +			es = NULL;
> +	}
> +
> +	while (es && ext4_es_end(es) <= end) {
> +		node = rb_next(&es->rb_node);
> +		ext4_es_store_status(es, EXTENT_STATUS_WRITTEN);
> +		if (!inode) {
> +			es = NULL;
> +			break;
> +		}
> +		es = rb_entry(node, struct extent_status, rb_node);
> +	}
> +
> +	if (es && es->es_lblk < end + 1) {
> +		ext4_lblk_t orig_len = es->es_len;
> +
> +		/*
> +		 * Here we first set conv_es just because of avoiding copy the
> +		 * value of es to a temporary variable.
> +		 */
> +		len1 = ext4_es_end(es) - end;
> +		conv_es.es_lblk = es->es_lblk;
> +		conv_es.es_len = es->es_len - len1;
> +		ext4_es_store_pblock(&conv_es, ext4_es_pblock(es));
> +		ext4_es_store_status(&conv_es, EXTENT_STATUS_WRITTEN);
> +
> +		es->es_lblk = end + 1;
> +		es->es_len = len1;
> +		block = ext4_es_pblock(es) + orig_len - len1;
> +		ext4_es_store_pblock(es, block);
> +
> +		err = __es_insert_extent(inode, &conv_es);
> +		if (err)
> +			goto out;
> +	}
> +
> +out:
> +	write_unlock_irqrestore(&EXT4_I(inode)->i_es_lock, flags);
> +	return err;
> +}
  Is this really needed? Why don't you just use ext4_es_insert_extent() to
insert new extent of proper type? Also the way you wrote it, we can return
(freshly written) data to the user, then reclaim the extent status from
memory and later return 0s because we read the original status from disk
(conversion hasn't still happened on disk). That would be certainly
confusing.

									Honza
Zheng Liu - Feb. 15, 2013, 7:12 a.m.
On Tue, Feb 12, 2013 at 01:51:59PM +0100, Jan Kara wrote:
> On Fri 08-02-13 16:44:05, Zheng Liu wrote:
> > From: Zheng Liu <wenqing.lz@taobao.com>
> > 
> > This commit tries to convert unwritten extents from extent status tree
> > in end_io callback functions and ext4_ext_direct_IO.
>   Why should we do this?
> 
> ...
> > @@ -801,3 +807,147 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
> >  	tree->cache_es = NULL;
> >  	return nr_shrunk;
> >  }
> > +
> > +int ext4_es_convert_unwritten_extents(struct inode *inode, loff_t offset,
> > +				      size_t size)
> > +{
> > +	struct ext4_es_tree *tree;
> > +	struct rb_node *node;
> > +	struct extent_status *es, orig_es, conv_es;
> > +	ext4_lblk_t end, len1, len2;
> > +	ext4_lblk_t lblk = 0, len = 0;
> > +	ext4_fsblk_t block;
> > +	unsigned long flags;
> > +	unsigned int blkbits;
> > +	int err = 0;
> > +
> > +	trace_ext4_es_convert_unwritten_extents(inode, offset, size);
> > +	blkbits = inode->i_blkbits;
> > +	lblk = offset >> blkbits;
> > +	len = (EXT4_BLOCK_ALIGN(offset + size, blkbits) >> blkbits) - lblk;
> > +
> > +	end = lblk + len - 1;
> > +	BUG_ON(end < lblk);
> > +
> > +	tree = &EXT4_I(inode)->i_es_tree;
> > +
> > +	write_lock_irqsave(&EXT4_I(inode)->i_es_lock, flags);
> > +	es = __es_tree_search(&tree->root, lblk);
> > +	if (!es)
> > +		goto out;
> > +	if (es->es_lblk > end)
> > +		goto out;
> > +
> > +	tree->cache_es = NULL;
> > +
> > +	orig_es.es_lblk = es->es_lblk;
> > +	orig_es.es_len = es->es_len;
> > +	orig_es.es_pblk = es->es_pblk;
> > +
> > +	len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
> > +	len2 = ext4_es_end(es) > end ?
> > +	       ext4_es_end(es) - end : 0;
> > +	if (len1 > 0)
> > +		es->es_len = len1;
> > +	if (len2 > 0) {
> > +		if (len1 > 0) {
> > +			struct extent_status newes;
> > +
> > +			newes.es_lblk = end + 1;
> > +			newes.es_len = len2;
> > +			block = ext4_es_pblock(&orig_es) +
> > +				orig_es.es_len - len2;
> > +			ext4_es_store_pblock(&newes, block);
> > +			ext4_es_store_status(&newes, ext4_es_status(&orig_es));
> > +			err = __es_insert_extent(inode, &newes);
> > +			if (err) {
> > +				es->es_lblk = orig_es.es_lblk;
> > +				es->es_len = orig_es.es_len;
> > +				es->es_pblk = orig_es.es_pblk;
> > +				goto out;
> > +			}
> > +
> > +			conv_es.es_lblk = orig_es.es_lblk + len1;
> > +			conv_es.es_len = orig_es.es_len - len1 - len2;
> > +			block = ext4_es_pblock(&orig_es) + len1;
> > +			ext4_es_store_pblock(&conv_es, block);
> > +			ext4_es_store_status(&conv_es, EXTENT_STATUS_WRITTEN);
> > +			err = __es_insert_extent(inode, &conv_es);
> > +			if (err) {
> > +				int err2 = __es_remove_extent(inode,
> > +							conv_es.es_lblk,
> > +							ext4_es_end(&newes));
> > +				if (err2)
> > +					goto out;
> > +				es->es_lblk = orig_es.es_lblk;
> > +				es->es_len = orig_es.es_len;
> > +				es->es_pblk = orig_es.es_pblk;
> > +				goto out;
> > +			}
> > +		} else {
> > +			es->es_lblk = end + 1;
> > +			es->es_len = len2;
> > +			block = ext4_es_pblock(&orig_es) +
> > +				orig_es.es_len - len2;
> > +			ext4_es_store_pblock(es, block);
> > +
> > +			conv_es.es_lblk = orig_es.es_lblk;
> > +			conv_es.es_len = orig_es.es_len - len2;
> > +			ext4_es_store_pblock(&conv_es,
> > +					     ext4_es_pblock(&orig_es));
> > +			ext4_es_store_status(&conv_es, EXTENT_STATUS_WRITTEN);
> > +			err = __es_insert_extent(inode, &conv_es);
> > +			if (err) {
> > +				es->es_lblk = orig_es.es_lblk;
> > +				es->es_len = orig_es.es_len;
> > +				es->es_pblk = orig_es.es_pblk;
> > +			}
> > +		}
> > +		goto out;
> > +	}
> > +
> > +	if (len1 > 0) {
> > +		node = rb_next(&es->rb_node);
> > +		if (node)
> > +			es = rb_entry(node, struct extent_status, rb_node);
> > +		else
> > +			es = NULL;
> > +	}
> > +
> > +	while (es && ext4_es_end(es) <= end) {
> > +		node = rb_next(&es->rb_node);
> > +		ext4_es_store_status(es, EXTENT_STATUS_WRITTEN);
> > +		if (!inode) {
> > +			es = NULL;
> > +			break;
> > +		}
> > +		es = rb_entry(node, struct extent_status, rb_node);
> > +	}
> > +
> > +	if (es && es->es_lblk < end + 1) {
> > +		ext4_lblk_t orig_len = es->es_len;
> > +
> > +		/*
> > +		 * Here we first set conv_es just because of avoiding copy the
> > +		 * value of es to a temporary variable.
> > +		 */
> > +		len1 = ext4_es_end(es) - end;
> > +		conv_es.es_lblk = es->es_lblk;
> > +		conv_es.es_len = es->es_len - len1;
> > +		ext4_es_store_pblock(&conv_es, ext4_es_pblock(es));
> > +		ext4_es_store_status(&conv_es, EXTENT_STATUS_WRITTEN);
> > +
> > +		es->es_lblk = end + 1;
> > +		es->es_len = len1;
> > +		block = ext4_es_pblock(es) + orig_len - len1;
> > +		ext4_es_store_pblock(es, block);
> > +
> > +		err = __es_insert_extent(inode, &conv_es);
> > +		if (err)
> > +			goto out;
> > +	}
> > +
> > +out:
> > +	write_unlock_irqrestore(&EXT4_I(inode)->i_es_lock, flags);
> > +	return err;
> > +}
>   Is this really needed? Why don't you just use ext4_es_insert_extent() to
> insert new extent of proper type? Also the way you wrote it, we can return
> (freshly written) data to the user, then reclaim the extent status from
> memory and later return 0s because we read the original status from disk
> (conversion hasn't still happened on disk). That would be certainly
> confusing.

Yes, you are right.  So it seems that a new flag called
EXTENT_STATUS_DIRTY need to be defined to prevent memory reclaim from
shrinker.

Thanks,
                                                - Zheng
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 9f21430..a03cabf 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4443,8 +4443,10 @@  int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 			ret = PTR_ERR(handle);
 			break;
 		}
-		ret = ext4_map_blocks(handle, inode, &map,
-				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
+		down_write(&EXT4_I(inode)->i_data_sem);
+		ret = ext4_ext_map_blocks(handle, inode, &map,
+					  EXT4_GET_BLOCKS_IO_CONVERT_EXT);
+		up_write(&EXT4_I(inode)->i_data_sem);
 		if (ret <= 0) {
 			WARN_ON(ret <= 0);
 			ext4_msg(inode->i_sb, KERN_ERR,
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index bac5286..eab8893 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -248,10 +248,11 @@  ext4_lblk_t ext4_es_find_delayed_extent(struct inode *inode,
 	struct extent_status *es1 = NULL;
 	struct rb_node *node;
 	ext4_lblk_t ret = EXT_MAX_BLOCKS;
+	unsigned long flags;
 
 	trace_ext4_es_find_delayed_extent_enter(inode, es->es_lblk);
 
-	read_lock(&EXT4_I(inode)->i_es_lock);
+	read_lock_irqsave(&EXT4_I(inode)->i_es_lock, flags);
 	tree = &EXT4_I(inode)->i_es_tree;
 
 	/* find extent in cache firstly */
@@ -291,7 +292,7 @@  out:
 		}
 	}
 
-	read_unlock(&EXT4_I(inode)->i_es_lock);
+	read_unlock_irqrestore(&EXT4_I(inode)->i_es_lock, flags);
 
 	ext4_es_lru_add(inode);
 	trace_ext4_es_find_delayed_extent_exit(inode, es, ret);
@@ -458,6 +459,7 @@  int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
 {
 	struct extent_status newes;
 	ext4_lblk_t end = lblk + len - 1;
+	unsigned long flags;
 	int err = 0;
 
 	es_debug("add [%u/%u) %llu %llx to extent status tree of inode %lu\n",
@@ -471,14 +473,14 @@  int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
 	ext4_es_store_status(&newes, status);
 	trace_ext4_es_insert_extent(inode, &newes);
 
-	write_lock(&EXT4_I(inode)->i_es_lock);
+	write_lock_irqsave(&EXT4_I(inode)->i_es_lock, flags);
 	err = __es_remove_extent(inode, lblk, end);
 	if (err != 0)
 		goto error;
 	err = __es_insert_extent(inode, &newes);
 
 error:
-	write_unlock(&EXT4_I(inode)->i_es_lock);
+	write_unlock_irqrestore(&EXT4_I(inode)->i_es_lock, flags);
 
 	ext4_es_lru_add(inode);
 	ext4_es_print_tree(inode);
@@ -498,13 +500,14 @@  int ext4_es_lookup_extent(struct inode *inode, struct extent_status *es)
 	struct ext4_es_tree *tree;
 	struct extent_status *es1 = NULL;
 	struct rb_node *node;
+	unsigned long flags;
 	int found = 0;
 
 	trace_ext4_es_lookup_extent_enter(inode, es->es_lblk);
 	es_debug("lookup extent in block %u\n", es->es_lblk);
 
 	tree = &EXT4_I(inode)->i_es_tree;
-	read_lock(&EXT4_I(inode)->i_es_lock);
+	read_lock_irqsave(&EXT4_I(inode)->i_es_lock, flags);
 
 	/* find extent in cache firstly */
 	es->es_len = es->es_pblk = 0;
@@ -539,7 +542,7 @@  out:
 		es->es_pblk = es1->es_pblk;
 	}
 
-	read_unlock(&EXT4_I(inode)->i_es_lock);
+	read_unlock_irqrestore(&EXT4_I(inode)->i_es_lock, flags);
 
 	ext4_es_lru_add(inode);
 	trace_ext4_es_lookup_extent_exit(inode, es, found);
@@ -649,6 +652,7 @@  int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 			  ext4_lblk_t len)
 {
 	ext4_lblk_t end;
+	unsigned long flags;
 	int err = 0;
 
 	trace_ext4_es_remove_extent(inode, lblk, len);
@@ -658,9 +662,9 @@  int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 	end = lblk + len - 1;
 	BUG_ON(end < lblk);
 
-	write_lock(&EXT4_I(inode)->i_es_lock);
+	write_lock_irqsave(&EXT4_I(inode)->i_es_lock, flags);
 	err = __es_remove_extent(inode, lblk, end);
-	write_unlock(&EXT4_I(inode)->i_es_lock);
+	write_unlock_irqrestore(&EXT4_I(inode)->i_es_lock, flags);
 	ext4_es_print_tree(inode);
 	return err;
 }
@@ -671,6 +675,7 @@  static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
 					struct ext4_sb_info, s_es_shrinker);
 	struct ext4_inode_info *ei;
 	struct list_head *cur, *tmp, scanned;
+	unsigned long flags;
 	int nr_to_scan = sc->nr_to_scan;
 	int ret, nr_shrunk = 0;
 
@@ -687,16 +692,16 @@  static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
 
 		ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
 
-		read_lock(&ei->i_es_lock);
+		read_lock_irqsave(&ei->i_es_lock, flags);
 		if (ei->i_es_lru_nr == 0) {
-			read_unlock(&ei->i_es_lock);
+			read_unlock_irqrestore(&ei->i_es_lock, flags);
 			continue;
 		}
-		read_unlock(&ei->i_es_lock);
+		read_unlock_irqrestore(&ei->i_es_lock, flags);
 
-		write_lock(&ei->i_es_lock);
+		write_lock_irqsave(&ei->i_es_lock, flags);
 		ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
-		write_unlock(&ei->i_es_lock);
+		write_unlock_irqrestore(&ei->i_es_lock, flags);
 
 		nr_shrunk += ret;
 		nr_to_scan -= ret;
@@ -756,14 +761,15 @@  static int ext4_es_reclaim_extents_count(struct super_block *sb)
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_inode_info *ei;
 	struct list_head *cur;
+	unsigned long flags;
 	int nr_cached = 0;
 
 	spin_lock(&sbi->s_es_lru_lock);
 	list_for_each(cur, &sbi->s_es_lru) {
 		ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
-		read_lock(&ei->i_es_lock);
+		read_lock_irqsave(&ei->i_es_lock, flags);
 		nr_cached += ei->i_es_lru_nr;
-		read_unlock(&ei->i_es_lock);
+		read_unlock_irqrestore(&ei->i_es_lock, flags);
 	}
 	spin_unlock(&sbi->s_es_lru_lock);
 	trace_ext4_es_reclaim_extents_count(sb, nr_cached);
@@ -801,3 +807,147 @@  static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
 	tree->cache_es = NULL;
 	return nr_shrunk;
 }
+
+int ext4_es_convert_unwritten_extents(struct inode *inode, loff_t offset,
+				      size_t size)
+{
+	struct ext4_es_tree *tree;
+	struct rb_node *node;
+	struct extent_status *es, orig_es, conv_es;
+	ext4_lblk_t end, len1, len2;
+	ext4_lblk_t lblk = 0, len = 0;
+	ext4_fsblk_t block;
+	unsigned long flags;
+	unsigned int blkbits;
+	int err = 0;
+
+	trace_ext4_es_convert_unwritten_extents(inode, offset, size);
+	blkbits = inode->i_blkbits;
+	lblk = offset >> blkbits;
+	len = (EXT4_BLOCK_ALIGN(offset + size, blkbits) >> blkbits) - lblk;
+
+	end = lblk + len - 1;
+	BUG_ON(end < lblk);
+
+	tree = &EXT4_I(inode)->i_es_tree;
+
+	write_lock_irqsave(&EXT4_I(inode)->i_es_lock, flags);
+	es = __es_tree_search(&tree->root, lblk);
+	if (!es)
+		goto out;
+	if (es->es_lblk > end)
+		goto out;
+
+	tree->cache_es = NULL;
+
+	orig_es.es_lblk = es->es_lblk;
+	orig_es.es_len = es->es_len;
+	orig_es.es_pblk = es->es_pblk;
+
+	len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
+	len2 = ext4_es_end(es) > end ?
+	       ext4_es_end(es) - end : 0;
+	if (len1 > 0)
+		es->es_len = len1;
+	if (len2 > 0) {
+		if (len1 > 0) {
+			struct extent_status newes;
+
+			newes.es_lblk = end + 1;
+			newes.es_len = len2;
+			block = ext4_es_pblock(&orig_es) +
+				orig_es.es_len - len2;
+			ext4_es_store_pblock(&newes, block);
+			ext4_es_store_status(&newes, ext4_es_status(&orig_es));
+			err = __es_insert_extent(inode, &newes);
+			if (err) {
+				es->es_lblk = orig_es.es_lblk;
+				es->es_len = orig_es.es_len;
+				es->es_pblk = orig_es.es_pblk;
+				goto out;
+			}
+
+			conv_es.es_lblk = orig_es.es_lblk + len1;
+			conv_es.es_len = orig_es.es_len - len1 - len2;
+			block = ext4_es_pblock(&orig_es) + len1;
+			ext4_es_store_pblock(&conv_es, block);
+			ext4_es_store_status(&conv_es, EXTENT_STATUS_WRITTEN);
+			err = __es_insert_extent(inode, &conv_es);
+			if (err) {
+				int err2 = __es_remove_extent(inode,
+							conv_es.es_lblk,
+							ext4_es_end(&newes));
+				if (err2)
+					goto out;
+				es->es_lblk = orig_es.es_lblk;
+				es->es_len = orig_es.es_len;
+				es->es_pblk = orig_es.es_pblk;
+				goto out;
+			}
+		} else {
+			es->es_lblk = end + 1;
+			es->es_len = len2;
+			block = ext4_es_pblock(&orig_es) +
+				orig_es.es_len - len2;
+			ext4_es_store_pblock(es, block);
+
+			conv_es.es_lblk = orig_es.es_lblk;
+			conv_es.es_len = orig_es.es_len - len2;
+			ext4_es_store_pblock(&conv_es,
+					     ext4_es_pblock(&orig_es));
+			ext4_es_store_status(&conv_es, EXTENT_STATUS_WRITTEN);
+			err = __es_insert_extent(inode, &conv_es);
+			if (err) {
+				es->es_lblk = orig_es.es_lblk;
+				es->es_len = orig_es.es_len;
+				es->es_pblk = orig_es.es_pblk;
+			}
+		}
+		goto out;
+	}
+
+	if (len1 > 0) {
+		node = rb_next(&es->rb_node);
+		if (node)
+			es = rb_entry(node, struct extent_status, rb_node);
+		else
+			es = NULL;
+	}
+
+	while (es && ext4_es_end(es) <= end) {
+		node = rb_next(&es->rb_node);
+		ext4_es_store_status(es, EXTENT_STATUS_WRITTEN);
+		if (!inode) {
+			es = NULL;
+			break;
+		}
+		es = rb_entry(node, struct extent_status, rb_node);
+	}
+
+	if (es && es->es_lblk < end + 1) {
+		ext4_lblk_t orig_len = es->es_len;
+
+		/*
+		 * Here we first set conv_es just because of avoiding copy the
+		 * value of es to a temporary variable.
+		 */
+		len1 = ext4_es_end(es) - end;
+		conv_es.es_lblk = es->es_lblk;
+		conv_es.es_len = es->es_len - len1;
+		ext4_es_store_pblock(&conv_es, ext4_es_pblock(es));
+		ext4_es_store_status(&conv_es, EXTENT_STATUS_WRITTEN);
+
+		es->es_lblk = end + 1;
+		es->es_len = len1;
+		block = ext4_es_pblock(es) + orig_len - len1;
+		ext4_es_store_pblock(es, block);
+
+		err = __es_insert_extent(inode, &conv_es);
+		if (err)
+			goto out;
+	}
+
+out:
+	write_unlock_irqrestore(&EXT4_I(inode)->i_es_lock, flags);
+	return err;
+}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 938ad2b..2849d74 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -54,6 +54,8 @@  extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 extern ext4_lblk_t ext4_es_find_delayed_extent(struct inode *inode,
 					       struct extent_status *es);
 extern int ext4_es_lookup_extent(struct inode *inode, struct extent_status *es);
+extern int ext4_es_convert_unwritten_extents(struct inode *inode, loff_t offset,
+					     size_t size);
 
 static inline int ext4_es_is_written(struct extent_status *es)
 {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 670779a..08cf720 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3063,6 +3063,7 @@  out:
 		io_end->result = ret;
 	}
 
+	ext4_es_convert_unwritten_extents(inode, offset, size);
 	ext4_add_complete_io(io_end);
 }
 
@@ -3088,6 +3089,7 @@  static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
 	 */
 	inode = io_end->inode;
 	ext4_set_io_unwritten_flag(inode, io_end);
+	ext4_es_convert_unwritten_extents(inode, io_end->offset, io_end->size);
 	ext4_add_complete_io(io_end);
 out:
 	bh->b_private = NULL;
@@ -3246,6 +3248,9 @@  static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 	} else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
 						EXT4_STATE_DIO_UNWRITTEN)) {
 		int err;
+		err = ext4_es_convert_unwritten_extents(inode, offset, ret);
+		if (err)
+			ret = err;
 		/*
 		 * for non AIO case, since the IO is already
 		 * completed, we could do the conversion right here
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 0016fbc..66ea30e 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -276,6 +276,13 @@  static void ext4_end_bio(struct bio *bio, int error)
 		error = 0;
 	bio_put(bio);
 
+	/*
+	 * We need to convert unwrittne extents in extent status tree before
+	 * end_page_writeback() is called.  Otherwise, when dioread_nolock is
+	 * enabled, we will be likely to read stale data.
+	 */
+	inode = io_end->inode;
+	ext4_es_convert_unwritten_extents(inode, io_end->offset, io_end->size);
 	for (i = 0; i < io_end->num_io_pages; i++) {
 		struct page *page = io_end->pages[i]->p_page;
 		struct buffer_head *bh, *head;
@@ -305,7 +312,6 @@  static void ext4_end_bio(struct bio *bio, int error)
 		put_io_page(io_end->pages[i]);
 	}
 	io_end->num_io_pages = 0;
-	inode = io_end->inode;
 
 	if (error) {
 		io_end->flag |= EXT4_IO_END_ERROR;
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index f0734b3..d32e3d5 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -2233,6 +2233,31 @@  TRACE_EVENT(ext4_es_lookup_extent_exit,
 		  __entry->found ? __entry->status : 0)
 );
 
+TRACE_EVENT(ext4_es_convert_unwritten_extents,
+	TP_PROTO(struct inode *inode, loff_t offset, loff_t size),
+
+	TP_ARGS(inode, offset, size),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev			)
+		__field(	ino_t,	ino			)
+		__field(	loff_t,	offset			)
+		__field(	loff_t, size			)
+	),
+
+	TP_fast_assign(
+		__entry->dev	= inode->i_sb->s_dev;
+		__entry->ino	= inode->i_ino;
+		__entry->offset	= offset;
+		__entry->size	= size;
+	),
+
+	TP_printk("dev %d,%d ino %lu convert unwritten extents [%llu/%llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long) __entry->ino,
+		  __entry->offset, __entry->size)
+);
+
 TRACE_EVENT(ext4_es_reclaim_extents_count,
 	TP_PROTO(struct super_block *sb, int nr_cached),