Patchwork [05/10,v5] ext4: lookup block mapping in extent status tree

login
register
mail settings
Submitter Zheng Liu
Date Feb. 8, 2013, 8:44 a.m.
Message ID <1360313046-9876-6-git-send-email-wenqing.lz@taobao.com>
Download mbox | patch
Permalink /patch/219061/
State Superseded
Headers show

Comments

Zheng Liu - Feb. 8, 2013, 8:44 a.m.
From: Zheng Liu <wenqing.lz@taobao.com>

After tracking all extent status, we already have a extent cache in
memory.  Every time we want to lookup a block mapping, we can first
try to lookup it in extent status tree to avoid a potential disk I/O.

A new function called ext4_es_lookup_extent is defined to finish this
work.  When we try to lookup a block mapping, we always call
ext4_map_blocks and/or ext4_da_map_blocks.  So in these functions we
first try to lookup a block mapping in extent status tree.

A new flag EXT4_GET_BLOCKS_NO_PUT_HOLE is used in ext4_da_map_blocks
in order not to put a hole into extent status tree because this hole
will be converted to delayed extent in the tree immediately.

Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Jan kara <jack@suse.cz>
---
 fs/ext4/ext4.h              |  2 ++
 fs/ext4/extents.c           |  7 ++++-
 fs/ext4/extents_status.c    | 59 +++++++++++++++++++++++++++++++++++++++++
 fs/ext4/extents_status.h    |  1 +
 fs/ext4/inode.c             | 64 +++++++++++++++++++++++++++++++++++++++++++--
 include/trace/events/ext4.h | 56 +++++++++++++++++++++++++++++++++++++++
 6 files changed, 186 insertions(+), 3 deletions(-)
Jan Kara - Feb. 12, 2013, 12:31 p.m.
On Fri 08-02-13 16:44:01, Zheng Liu wrote:
> From: Zheng Liu <wenqing.lz@taobao.com>
> 
> After tracking all extent status, we already have a extent cache in
> memory.  Every time we want to lookup a block mapping, we can first
> try to lookup it in extent status tree to avoid a potential disk I/O.
> 
> A new function called ext4_es_lookup_extent is defined to finish this
> work.  When we try to lookup a block mapping, we always call
> ext4_map_blocks and/or ext4_da_map_blocks.  So in these functions we
> first try to lookup a block mapping in extent status tree.
> 
> A new flag EXT4_GET_BLOCKS_NO_PUT_HOLE is used in ext4_da_map_blocks
> in order not to put a hole into extent status tree because this hole
> will be converted to delayed extent in the tree immediately.
  It looks somewhat inconsistent that you put hole into the extent tree in
ext4_ext_map_blocks() but all other extent types are handled in
ext4_map_blocks() or ext4_da_map_blocks(). Can we put the handling in one
place?

Otherwise the patch looks OK.

								Honza

> Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
> Cc: "Theodore Ts'o" <tytso@mit.edu>
> Cc: Jan kara <jack@suse.cz>
> ---
>  fs/ext4/ext4.h              |  2 ++
>  fs/ext4/extents.c           |  7 ++++-
>  fs/ext4/extents_status.c    | 59 +++++++++++++++++++++++++++++++++++++++++
>  fs/ext4/extents_status.h    |  1 +
>  fs/ext4/inode.c             | 64 +++++++++++++++++++++++++++++++++++++++++++--
>  include/trace/events/ext4.h | 56 +++++++++++++++++++++++++++++++++++++++
>  6 files changed, 186 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 8462eb3..ad885b5 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -582,6 +582,8 @@ enum {
>  #define EXT4_GET_BLOCKS_KEEP_SIZE		0x0080
>  	/* Do not take i_data_sem locking in ext4_map_blocks */
>  #define EXT4_GET_BLOCKS_NO_LOCK			0x0100
> +	/* Do not put hole in extent cache */
> +#define EXT4_GET_BLOCKS_NO_PUT_HOLE		0x0200
>  
>  /*
>   * Flags used by ext4_free_blocks
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 4b065ff..1be8955 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -2154,6 +2154,8 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
>  				block,
>  				le32_to_cpu(ex->ee_block),
>  				 ext4_ext_get_actual_len(ex));
> +		ext4_es_insert_extent(inode, lblock, len, ~0,
> +				      EXTENT_STATUS_HOLE);
>  	} else if (block >= le32_to_cpu(ex->ee_block)
>  			+ ext4_ext_get_actual_len(ex)) {
>  		ext4_lblk_t next;
> @@ -2167,6 +2169,8 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
>  				block);
>  		BUG_ON(next == lblock);
>  		len = next - lblock;
> +		ext4_es_insert_extent(inode, lblock, len, ~0,
> +				      EXTENT_STATUS_HOLE);
>  	} else {
>  		lblock = len = 0;
>  		BUG();
> @@ -4006,7 +4010,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
>  		 * put just found gap into cache to speed up
>  		 * subsequent requests
>  		 */
> -		ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
> +		if ((flags & EXT4_GET_BLOCKS_NO_PUT_HOLE) == 0)
> +			ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
>  		goto out2;
>  	}
>  
> diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
> index 71cb75a..ca7dc9f 100644
> --- a/fs/ext4/extents_status.c
> +++ b/fs/ext4/extents_status.c
> @@ -468,6 +468,65 @@ error:
>  	return err;
>  }
>  
> +/*
> + * ext4_es_lookup_extent() looks up an extent in extent status tree.
> + *
> + * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
> + *
> + * Return: 1 on found, 0 on not
> + */
> +int ext4_es_lookup_extent(struct inode *inode, struct extent_status *es)
> +{
> +	struct ext4_es_tree *tree;
> +	struct extent_status *es1 = NULL;
> +	struct rb_node *node;
> +	int found = 0;
> +
> +	trace_ext4_es_lookup_extent_enter(inode, es->es_lblk);
> +	es_debug("lookup extent in block %u\n", es->es_lblk);
> +
> +	tree = &EXT4_I(inode)->i_es_tree;
> +	read_lock(&EXT4_I(inode)->i_es_lock);
> +
> +	/* find extent in cache firstly */
> +	es->es_len = es->es_pblk = 0;
> +	if (tree->cache_es) {
> +		es1 = tree->cache_es;
> +		if (in_range(es->es_lblk, es1->es_lblk, es1->es_len)) {
> +			es_debug("%u cached by [%u/%u)\n",
> +				 es->es_lblk, es1->es_lblk, es1->es_len);
> +			found = 1;
> +			goto out;
> +		}
> +	}
> +
> +	node = tree->root.rb_node;
> +	while (node) {
> +		es1 = rb_entry(node, struct extent_status, rb_node);
> +		if (es->es_lblk < es1->es_lblk)
> +			node = node->rb_left;
> +		else if (es->es_lblk > ext4_es_end(es1))
> +			node = node->rb_right;
> +		else {
> +			found = 1;
> +			break;
> +		}
> +	}
> +
> +out:
> +	if (found) {
> +		BUG_ON(!es1);
> +		es->es_lblk = es1->es_lblk;
> +		es->es_len = es1->es_len;
> +		es->es_pblk = es1->es_pblk;
> +	}
> +
> +	read_unlock(&EXT4_I(inode)->i_es_lock);
> +
> +	trace_ext4_es_lookup_extent_exit(inode, es, found);
> +	return found;
> +}
> +
>  static int __es_remove_extent(struct ext4_es_tree *tree, ext4_lblk_t lblk,
>  				 ext4_lblk_t end)
>  {
> diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
> index b5788eb..effe78c 100644
> --- a/fs/ext4/extents_status.h
> +++ b/fs/ext4/extents_status.h
> @@ -53,6 +53,7 @@ extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
>  				 ext4_lblk_t len);
>  extern ext4_lblk_t ext4_es_find_delayed_extent(struct inode *inode,
>  					       struct extent_status *es);
> +extern int ext4_es_lookup_extent(struct inode *inode, struct extent_status *es);
>  
>  static inline int ext4_es_is_written(struct extent_status *es)
>  {
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 16454fc..670779a 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -508,12 +508,34 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
>  int ext4_map_blocks(handle_t *handle, struct inode *inode,
>  		    struct ext4_map_blocks *map, int flags)
>  {
> +	struct extent_status es;
>  	int retval;
>  
>  	map->m_flags = 0;
>  	ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
>  		  "logical block %lu\n", inode->i_ino, flags, map->m_len,
>  		  (unsigned long) map->m_lblk);
> +
> +	/* Lookup extent status tree firstly */
> +	es.es_lblk = map->m_lblk;
> +	if (ext4_es_lookup_extent(inode, &es)) {
> +		if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
> +			map->m_pblk = ext4_es_pblock(&es) +
> +					map->m_lblk - es.es_lblk;
> +			map->m_flags |= ext4_es_is_written(&es) ?
> +					EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
> +			retval = es.es_len - (map->m_lblk - es.es_lblk);
> +			if (retval > map->m_len)
> +				retval = map->m_len;
> +			map->m_len = retval;
> +		} else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
> +			retval = 0;
> +		} else {
> +			BUG_ON(1);
> +		}
> +		goto found;
> +	}
> +
>  	/*
>  	 * Try to see if we can get the block without requesting a new
>  	 * file system block.
> @@ -541,6 +563,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
>  	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
>  		up_read((&EXT4_I(inode)->i_data_sem));
>  
> +found:
>  	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
>  		int ret = check_block_validity(inode, map);
>  		if (ret != 0)
> @@ -1772,6 +1795,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
>  			      struct ext4_map_blocks *map,
>  			      struct buffer_head *bh)
>  {
> +	struct extent_status es;
>  	int retval;
>  	sector_t invalid_block = ~((sector_t) 0xffff);
>  
> @@ -1782,6 +1806,39 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
>  	ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
>  		  "logical block %lu\n", inode->i_ino, map->m_len,
>  		  (unsigned long) map->m_lblk);
> +
> +	/* Lookup extent status tree firstly */
> +	es.es_lblk = iblock;
> +	if (ext4_es_lookup_extent(inode, &es)) {
> +
> +		if (ext4_es_is_hole(&es)) {
> +			retval = 0;
> +			down_read((&EXT4_I(inode)->i_data_sem));
> +			goto add_delayed;
> +		}
> +
> +		if (ext4_es_is_delayed(&es)) {
> +			map_bh(bh, inode->i_sb, invalid_block);
> +			set_buffer_new(bh);
> +			set_buffer_delay(bh);
> +			return 0;
> +		}
> +
> +		map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk;
> +		retval = es.es_len - (iblock - es.es_lblk);
> +		if (retval > map->m_len)
> +			retval = map->m_len;
> +		map->m_len = retval;
> +		if (ext4_es_is_written(&es))
> +			map->m_flags |= EXT4_MAP_MAPPED;
> +		else if (ext4_es_is_unwritten(&es))
> +			map->m_flags |= EXT4_MAP_UNWRITTEN;
> +		else
> +			BUG_ON(1);
> +
> +		return retval;
> +	}
> +
>  	/*
>  	 * Try to see if we can get the block without requesting a new
>  	 * file system block.
> @@ -1800,10 +1857,13 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
>  			map->m_flags |= EXT4_MAP_FROM_CLUSTER;
>  		retval = 0;
>  	} else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
> -		retval = ext4_ext_map_blocks(NULL, inode, map, 0);
> +		retval = ext4_ext_map_blocks(NULL, inode, map,
> +					     EXT4_GET_BLOCKS_NO_PUT_HOLE);
>  	else
> -		retval = ext4_ind_map_blocks(NULL, inode, map, 0);
> +		retval = ext4_ind_map_blocks(NULL, inode, map,
> +					     EXT4_GET_BLOCKS_NO_PUT_HOLE);
>  
> +add_delayed:
>  	if (retval == 0) {
>  		int ret;
>  		/*
> diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
> index d278ced..822780a 100644
> --- a/include/trace/events/ext4.h
> +++ b/include/trace/events/ext4.h
> @@ -2177,6 +2177,62 @@ TRACE_EVENT(ext4_es_find_delayed_extent_exit,
>  		  __entry->pblk, __entry->status, __entry->ret)
>  );
>  
> +TRACE_EVENT(ext4_es_lookup_extent_enter,
> +	TP_PROTO(struct inode *inode, ext4_lblk_t lblk),
> +
> +	TP_ARGS(inode, lblk),
> +
> +	TP_STRUCT__entry(
> +		__field(	dev_t,		dev		)
> +		__field(	ino_t,		ino		)
> +		__field(	ext4_lblk_t,	lblk		)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->dev	= inode->i_sb->s_dev;
> +		__entry->ino	= inode->i_ino;
> +		__entry->lblk	= lblk;
> +	),
> +
> +	TP_printk("dev %d,%d ino %lu lblk %u",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  (unsigned long) __entry->ino, __entry->lblk)
> +);
> +
> +TRACE_EVENT(ext4_es_lookup_extent_exit,
> +	TP_PROTO(struct inode *inode, struct extent_status *es,
> +		 int found),
> +
> +	TP_ARGS(inode, es, found),
> +
> +	TP_STRUCT__entry(
> +		__field(	dev_t,		dev		)
> +		__field(	ino_t,		ino		)
> +		__field(	ext4_lblk_t,	lblk		)
> +		__field(	ext4_lblk_t,	len		)
> +		__field(	ext4_fsblk_t,	pblk		)
> +		__field(	unsigned long long,	status	)
> +		__field(	int,		found		)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->dev	= inode->i_sb->s_dev;
> +		__entry->ino	= inode->i_ino;
> +		__entry->lblk	= es->es_lblk;
> +		__entry->len	= es->es_len;
> +		__entry->pblk	= ext4_es_pblock(es);
> +		__entry->status	= ext4_es_status(es);
> +		__entry->found	= found;
> +	),
> +
> +	TP_printk("dev %d,%d ino %lu found %d [%u/%u) %llu %llx",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  (unsigned long) __entry->ino, __entry->found,
> +		  __entry->lblk, __entry->len,
> +		  __entry->found ? __entry->pblk : 0,
> +		  __entry->found ? __entry->status : 0)
> +);
> +
>  #endif /* _TRACE_EXT4_H */
>  
>  /* This part must be outside protection */
> -- 
> 1.7.12.rc2.18.g61b472e
>
Zheng Liu - Feb. 15, 2013, 7:06 a.m.
On Tue, Feb 12, 2013 at 01:31:42PM +0100, Jan Kara wrote:
> On Fri 08-02-13 16:44:01, Zheng Liu wrote:
> > From: Zheng Liu <wenqing.lz@taobao.com>
> > 
> > After tracking all extent status, we already have a extent cache in
> > memory.  Every time we want to lookup a block mapping, we can first
> > try to lookup it in extent status tree to avoid a potential disk I/O.
> > 
> > A new function called ext4_es_lookup_extent is defined to finish this
> > work.  When we try to lookup a block mapping, we always call
> > ext4_map_blocks and/or ext4_da_map_blocks.  So in these functions we
> > first try to lookup a block mapping in extent status tree.
> > 
> > A new flag EXT4_GET_BLOCKS_NO_PUT_HOLE is used in ext4_da_map_blocks
> > in order not to put a hole into extent status tree because this hole
> > will be converted to delayed extent in the tree immediately.
>   It looks somewhat inconsistent that you put hole into the extent tree in
> ext4_ext_map_blocks() but all other extent types are handled in
> ext4_map_blocks() or ext4_da_map_blocks(). Can we put the handling in one
> place?

It seems that putting all handlings in one place is too complex because
ext4_da_map_blocks() calls ext4_ext_map_blocks() and ext4_ind_map_blocks()
directly.  So now we put all extent except hole in ext4_map_blocks() and
ext4_da_map_blocks().  For the hole, it will be inserted into the status
tree in ext4_ext_put_gap_in_cache().  In this function we can get the
the length of the hole.  If we handle it in ext4_da_map_blocks() or
ext4_map_blocks(), we only can insert a hole which the length of this
hole is 1 because in these functions we couldn't know the length of the
hole.

I am planning to refine the get_block_t and *map_blocks functions.  At
that time I will try to fix this problem.

Thanks,
                                                - Zheng
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara - Feb. 15, 2013, 4:47 p.m.
On Fri 15-02-13 15:06:26, Zheng Liu wrote:
> On Tue, Feb 12, 2013 at 01:31:42PM +0100, Jan Kara wrote:
> > On Fri 08-02-13 16:44:01, Zheng Liu wrote:
> > > From: Zheng Liu <wenqing.lz@taobao.com>
> > > 
> > > After tracking all extent status, we already have a extent cache in
> > > memory.  Every time we want to lookup a block mapping, we can first
> > > try to lookup it in extent status tree to avoid a potential disk I/O.
> > > 
> > > A new function called ext4_es_lookup_extent is defined to finish this
> > > work.  When we try to lookup a block mapping, we always call
> > > ext4_map_blocks and/or ext4_da_map_blocks.  So in these functions we
> > > first try to lookup a block mapping in extent status tree.
> > > 
> > > A new flag EXT4_GET_BLOCKS_NO_PUT_HOLE is used in ext4_da_map_blocks
> > > in order not to put a hole into extent status tree because this hole
> > > will be converted to delayed extent in the tree immediately.
> >   It looks somewhat inconsistent that you put hole into the extent tree in
> > ext4_ext_map_blocks() but all other extent types are handled in
> > ext4_map_blocks() or ext4_da_map_blocks(). Can we put the handling in one
> > place?
> 
> It seems that putting all handlings in one place is too complex because
> ext4_da_map_blocks() calls ext4_ext_map_blocks() and ext4_ind_map_blocks()
> directly.  So now we put all extent except hole in ext4_map_blocks() and
> ext4_da_map_blocks().  For the hole, it will be inserted into the status
> tree in ext4_ext_put_gap_in_cache().  In this function we can get the
> the length of the hole.  If we handle it in ext4_da_map_blocks() or
> ext4_map_blocks(), we only can insert a hole which the length of this
> hole is 1 because in these functions we couldn't know the length of the
> hole.
  Ah right, thanks for explanation.

								Honza
Theodore Ts'o - Feb. 15, 2013, 5:25 p.m.
On Fri, Feb 15, 2013 at 03:06:26PM +0800, Zheng Liu wrote:
> 
> I am planning to refine the get_block_t and *map_blocks functions.  At
> that time I will try to fix this problem.

Note that get_block_t can't be changed without disrupting the Direct
I/O functions which are generic VFS functions.  There's been talk of
trying to clean up DIO, but it will probably require building a
parallel infrastructure in the generic layer, and then transitioning
individual file systems over to it.  It is definitely a mess, but it's
going to be a very tricky problem.  I suspect we'll be talking about
it at LSF/MM.

One thing thing which might be an interesting thing to do that
wouldn't require wholesale changes to generic code would be to
transition ext4_readpages() to use fs/ext4/page-io.c.  Not for this
merge window, in all likelihood, but right now we are calling
ext4_get_block() for every single page that we read in, while is
wasteful.  It would be nice if ext4_readpages() called
ext4_map_blocks() for each extent, and then submitted it using the
page-io.c functions so we don't end up calling into ext4_map_blocks()
quite as much.

That will ease our scalability and remove locking overhead, in
addition to saving CPU for the buffered I/O readpages path.
Eventually it would be good to do this for DIO as well, but that's
going to require a lot more work, and coordination with the developers
of btrfs, xfs, etc.

					- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zheng Liu - Feb. 16, 2013, 2:32 a.m.
On Fri, Feb 15, 2013 at 12:25:49PM -0500, Theodore Ts'o wrote:
> On Fri, Feb 15, 2013 at 03:06:26PM +0800, Zheng Liu wrote:
> > 
> > I am planning to refine the get_block_t and *map_blocks functions.  At
> > that time I will try to fix this problem.
> 
> Note that get_block_t can't be changed without disrupting the Direct
> I/O functions which are generic VFS functions.  There's been talk of
> trying to clean up DIO, but it will probably require building a
> parallel infrastructure in the generic layer, and then transitioning
> individual file systems over to it.  It is definitely a mess, but it's
> going to be a very tricky problem.  I suspect we'll be talking about
> it at LSF/MM.
> 
> One thing thing which might be an interesting thing to do that
> wouldn't require wholesale changes to generic code would be to
> transition ext4_readpages() to use fs/ext4/page-io.c.  Not for this
> merge window, in all likelihood, but right now we are calling
> ext4_get_block() for every single page that we read in, while is
> wasteful.  It would be nice if ext4_readpages() called
> ext4_map_blocks() for each extent, and then submitted it using the
> page-io.c functions so we don't end up calling into ext4_map_blocks()
> quite as much.
> 
> That will ease our scalability and remove locking overhead, in
> addition to saving CPU for the buffered I/O readpages path.
> Eventually it would be good to do this for DIO as well, but that's
> going to require a lot more work, and coordination with the developers
> of btrfs, xfs, etc.

To be honest, my initial idea is only to split ext4_map_blocks into
ext4_map_blocks_read and ext4_map_blocks_write, and do some cleanups.
Thanks for your suggestions.  I will look at it carefully after the
patch series of extent status tree has been applied.

Thanks,
                                                - Zheng
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Theodore Ts'o - Feb. 16, 2013, 4:18 p.m.
On Sat, Feb 16, 2013 at 10:32:51AM +0800, Zheng Liu wrote:
> 
> To be honest, my initial idea is only to split ext4_map_blocks into
> ext4_map_blocks_read and ext4_map_blocks_write, and do some cleanups.
> Thanks for your suggestions.  I will look at it carefully after the
> patch series of extent status tree has been applied.

Ah, when you said get_block_t functions, I had assumed you had meant
changing the function signature --- because the function signature
being fixed by the generic DIO code is one of the things holding back
a number of improvements in the map_blocks code paths.

For example:

1) Thanks to the DIO code, we are ab(using) a struct buffer_head data
structure to pass the mapping to the DIO code.  Normally the
buffer_head maps only a single block's worth of data, but here b_size
is repurposed to indcate the size of the logical to physical block
mapping, and b_data is invalid (since it isn't a real buffer head).
There are a number of other fields in the struct buffer_head which in
the DIO codepath which are completely unused, which isn't just an
aesthetic issue --- it's also wasting valuable (and limited) kernel
stack space, since the struct buffer_head is allocated on the stack of
do_blockdev_direct_IO().

2) We are currently using inode flags to pass state flags between
different parts of the writepages code and the map_blocks code.  This
is bad because (a) it makes the code much harder to understand and
maintain, and (b) it blocks us from being able to call map_blocks() in
parallel.  If we fix this, it would be relatively trivial to add
support for parallel non-create map_block calls, and if we decide to
try to use the extent status tree for range locking, it might be
possible to do parallel block allocations sa well.  (I believe some
locking may be needed in mballoc.c for the inode-specific
preallocation code, but that should be doable.)

If we have multiple interested in working on various different
projects, it might be useful to start documenting some of these
proposed enhancements on the wiki, and certainly these would be good
things for us to discuss at the ext4 developer's workshop in April.

Regards,

					- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zheng Liu - Feb. 17, 2013, 3:15 a.m.
On Sat, Feb 16, 2013 at 11:18:46AM -0500, Theodore Ts'o wrote:
> On Sat, Feb 16, 2013 at 10:32:51AM +0800, Zheng Liu wrote:
> > 
> > To be honest, my initial idea is only to split ext4_map_blocks into
> > ext4_map_blocks_read and ext4_map_blocks_write, and do some cleanups.
> > Thanks for your suggestions.  I will look at it carefully after the
> > patch series of extent status tree has been applied.
> 
> Ah, when you said get_block_t functions, I had assumed you had meant
> changing the function signature --- because the function signature
> being fixed by the generic DIO code is one of the things holding back
> a number of improvements in the map_blocks code paths.
> 
> For example:
> 
> 1) Thanks to the DIO code, we are ab(using) a struct buffer_head data
> structure to pass the mapping to the DIO code.  Normally the
> buffer_head maps only a single block's worth of data, but here b_size
> is repurposed to indcate the size of the logical to physical block
> mapping, and b_data is invalid (since it isn't a real buffer head).
> There are a number of other fields in the struct buffer_head which in
> the DIO codepath which are completely unused, which isn't just an
> aesthetic issue --- it's also wasting valuable (and limited) kernel
> stack space, since the struct buffer_head is allocated on the stack of
> do_blockdev_direct_IO().

Yes, I also think struct buffer_head could be dropped, and get_block_t
signature should be changed.  But it needs to modify vfs layer.  We can
discuss this topic with other developers in LSF/MM summit and fsdevel@
mailing list.  AFAIK, this year in LSF/MM summit there is a topic about
this.

> 
> 2) We are currently using inode flags to pass state flags between
> different parts of the writepages code and the map_blocks code.  This
> is bad because (a) it makes the code much harder to understand and
> maintain, and (b) it blocks us from being able to call map_blocks() in
> parallel.  If we fix this, it would be relatively trivial to add
> support for parallel non-create map_block calls, and if we decide to
> try to use the extent status tree for range locking, it might be
> possible to do parallel block allocations sa well.  (I believe some
> locking may be needed in mballoc.c for the inode-specific
> preallocation code, but that should be doable.)

I think about extent-level locking when I run xfstests.  It seems that
we need to improve *_map_blocks() function if we want to use status tree
for range locking because we always need to lookup an extent in this
tree and lock this range before we do other things.  That is why I want
to split ext4_map_blocks function.  Certainly this is only my simple
idea, and I still need to consider it.  But I believe an improvement
and/or cleanup is a good start.

> 
> If we have multiple interested in working on various different
> projects, it might be useful to start documenting some of these
> proposed enhancements on the wiki, and certainly these would be good
> things for us to discuss at the ext4 developer's workshop in April.

Yes, there are a lot of things on ext4 wiki, which are out of date.
That would be great if it can be updated.

Looking forward to discussing this topics. :-)

                                                - Zheng
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8462eb3..ad885b5 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -582,6 +582,8 @@  enum {
 #define EXT4_GET_BLOCKS_KEEP_SIZE		0x0080
 	/* Do not take i_data_sem locking in ext4_map_blocks */
 #define EXT4_GET_BLOCKS_NO_LOCK			0x0100
+	/* Do not put hole in extent cache */
+#define EXT4_GET_BLOCKS_NO_PUT_HOLE		0x0200
 
 /*
  * Flags used by ext4_free_blocks
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 4b065ff..1be8955 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2154,6 +2154,8 @@  ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
 				block,
 				le32_to_cpu(ex->ee_block),
 				 ext4_ext_get_actual_len(ex));
+		ext4_es_insert_extent(inode, lblock, len, ~0,
+				      EXTENT_STATUS_HOLE);
 	} else if (block >= le32_to_cpu(ex->ee_block)
 			+ ext4_ext_get_actual_len(ex)) {
 		ext4_lblk_t next;
@@ -2167,6 +2169,8 @@  ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
 				block);
 		BUG_ON(next == lblock);
 		len = next - lblock;
+		ext4_es_insert_extent(inode, lblock, len, ~0,
+				      EXTENT_STATUS_HOLE);
 	} else {
 		lblock = len = 0;
 		BUG();
@@ -4006,7 +4010,8 @@  int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 		 * put just found gap into cache to speed up
 		 * subsequent requests
 		 */
-		ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
+		if ((flags & EXT4_GET_BLOCKS_NO_PUT_HOLE) == 0)
+			ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
 		goto out2;
 	}
 
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 71cb75a..ca7dc9f 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -468,6 +468,65 @@  error:
 	return err;
 }
 
+/*
+ * ext4_es_lookup_extent() looks up an extent in extent status tree.
+ *
+ * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
+ *
+ * Return: 1 on found, 0 on not
+ */
+int ext4_es_lookup_extent(struct inode *inode, struct extent_status *es)
+{
+	struct ext4_es_tree *tree;
+	struct extent_status *es1 = NULL;
+	struct rb_node *node;
+	int found = 0;
+
+	trace_ext4_es_lookup_extent_enter(inode, es->es_lblk);
+	es_debug("lookup extent in block %u\n", es->es_lblk);
+
+	tree = &EXT4_I(inode)->i_es_tree;
+	read_lock(&EXT4_I(inode)->i_es_lock);
+
+	/* find extent in cache firstly */
+	es->es_len = es->es_pblk = 0;
+	if (tree->cache_es) {
+		es1 = tree->cache_es;
+		if (in_range(es->es_lblk, es1->es_lblk, es1->es_len)) {
+			es_debug("%u cached by [%u/%u)\n",
+				 es->es_lblk, es1->es_lblk, es1->es_len);
+			found = 1;
+			goto out;
+		}
+	}
+
+	node = tree->root.rb_node;
+	while (node) {
+		es1 = rb_entry(node, struct extent_status, rb_node);
+		if (es->es_lblk < es1->es_lblk)
+			node = node->rb_left;
+		else if (es->es_lblk > ext4_es_end(es1))
+			node = node->rb_right;
+		else {
+			found = 1;
+			break;
+		}
+	}
+
+out:
+	if (found) {
+		BUG_ON(!es1);
+		es->es_lblk = es1->es_lblk;
+		es->es_len = es1->es_len;
+		es->es_pblk = es1->es_pblk;
+	}
+
+	read_unlock(&EXT4_I(inode)->i_es_lock);
+
+	trace_ext4_es_lookup_extent_exit(inode, es, found);
+	return found;
+}
+
 static int __es_remove_extent(struct ext4_es_tree *tree, ext4_lblk_t lblk,
 				 ext4_lblk_t end)
 {
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index b5788eb..effe78c 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -53,6 +53,7 @@  extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 				 ext4_lblk_t len);
 extern ext4_lblk_t ext4_es_find_delayed_extent(struct inode *inode,
 					       struct extent_status *es);
+extern int ext4_es_lookup_extent(struct inode *inode, struct extent_status *es);
 
 static inline int ext4_es_is_written(struct extent_status *es)
 {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 16454fc..670779a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -508,12 +508,34 @@  static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
 int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		    struct ext4_map_blocks *map, int flags)
 {
+	struct extent_status es;
 	int retval;
 
 	map->m_flags = 0;
 	ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
 		  "logical block %lu\n", inode->i_ino, flags, map->m_len,
 		  (unsigned long) map->m_lblk);
+
+	/* Lookup extent status tree firstly */
+	es.es_lblk = map->m_lblk;
+	if (ext4_es_lookup_extent(inode, &es)) {
+		if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
+			map->m_pblk = ext4_es_pblock(&es) +
+					map->m_lblk - es.es_lblk;
+			map->m_flags |= ext4_es_is_written(&es) ?
+					EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
+			retval = es.es_len - (map->m_lblk - es.es_lblk);
+			if (retval > map->m_len)
+				retval = map->m_len;
+			map->m_len = retval;
+		} else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
+			retval = 0;
+		} else {
+			BUG_ON(1);
+		}
+		goto found;
+	}
+
 	/*
 	 * Try to see if we can get the block without requesting a new
 	 * file system block.
@@ -541,6 +563,7 @@  int ext4_map_blocks(handle_t *handle, struct inode *inode,
 	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
 		up_read((&EXT4_I(inode)->i_data_sem));
 
+found:
 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
 		int ret = check_block_validity(inode, map);
 		if (ret != 0)
@@ -1772,6 +1795,7 @@  static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 			      struct ext4_map_blocks *map,
 			      struct buffer_head *bh)
 {
+	struct extent_status es;
 	int retval;
 	sector_t invalid_block = ~((sector_t) 0xffff);
 
@@ -1782,6 +1806,39 @@  static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 	ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
 		  "logical block %lu\n", inode->i_ino, map->m_len,
 		  (unsigned long) map->m_lblk);
+
+	/* Lookup extent status tree firstly */
+	es.es_lblk = iblock;
+	if (ext4_es_lookup_extent(inode, &es)) {
+
+		if (ext4_es_is_hole(&es)) {
+			retval = 0;
+			down_read((&EXT4_I(inode)->i_data_sem));
+			goto add_delayed;
+		}
+
+		if (ext4_es_is_delayed(&es)) {
+			map_bh(bh, inode->i_sb, invalid_block);
+			set_buffer_new(bh);
+			set_buffer_delay(bh);
+			return 0;
+		}
+
+		map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk;
+		retval = es.es_len - (iblock - es.es_lblk);
+		if (retval > map->m_len)
+			retval = map->m_len;
+		map->m_len = retval;
+		if (ext4_es_is_written(&es))
+			map->m_flags |= EXT4_MAP_MAPPED;
+		else if (ext4_es_is_unwritten(&es))
+			map->m_flags |= EXT4_MAP_UNWRITTEN;
+		else
+			BUG_ON(1);
+
+		return retval;
+	}
+
 	/*
 	 * Try to see if we can get the block without requesting a new
 	 * file system block.
@@ -1800,10 +1857,13 @@  static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 			map->m_flags |= EXT4_MAP_FROM_CLUSTER;
 		retval = 0;
 	} else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
-		retval = ext4_ext_map_blocks(NULL, inode, map, 0);
+		retval = ext4_ext_map_blocks(NULL, inode, map,
+					     EXT4_GET_BLOCKS_NO_PUT_HOLE);
 	else
-		retval = ext4_ind_map_blocks(NULL, inode, map, 0);
+		retval = ext4_ind_map_blocks(NULL, inode, map,
+					     EXT4_GET_BLOCKS_NO_PUT_HOLE);
 
+add_delayed:
 	if (retval == 0) {
 		int ret;
 		/*
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index d278ced..822780a 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -2177,6 +2177,62 @@  TRACE_EVENT(ext4_es_find_delayed_extent_exit,
 		  __entry->pblk, __entry->status, __entry->ret)
 );
 
+TRACE_EVENT(ext4_es_lookup_extent_enter,
+	TP_PROTO(struct inode *inode, ext4_lblk_t lblk),
+
+	TP_ARGS(inode, lblk),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,		dev		)
+		__field(	ino_t,		ino		)
+		__field(	ext4_lblk_t,	lblk		)
+	),
+
+	TP_fast_assign(
+		__entry->dev	= inode->i_sb->s_dev;
+		__entry->ino	= inode->i_ino;
+		__entry->lblk	= lblk;
+	),
+
+	TP_printk("dev %d,%d ino %lu lblk %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long) __entry->ino, __entry->lblk)
+);
+
+TRACE_EVENT(ext4_es_lookup_extent_exit,
+	TP_PROTO(struct inode *inode, struct extent_status *es,
+		 int found),
+
+	TP_ARGS(inode, es, found),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,		dev		)
+		__field(	ino_t,		ino		)
+		__field(	ext4_lblk_t,	lblk		)
+		__field(	ext4_lblk_t,	len		)
+		__field(	ext4_fsblk_t,	pblk		)
+		__field(	unsigned long long,	status	)
+		__field(	int,		found		)
+	),
+
+	TP_fast_assign(
+		__entry->dev	= inode->i_sb->s_dev;
+		__entry->ino	= inode->i_ino;
+		__entry->lblk	= es->es_lblk;
+		__entry->len	= es->es_len;
+		__entry->pblk	= ext4_es_pblock(es);
+		__entry->status	= ext4_es_status(es);
+		__entry->found	= found;
+	),
+
+	TP_printk("dev %d,%d ino %lu found %d [%u/%u) %llu %llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long) __entry->ino, __entry->found,
+		  __entry->lblk, __entry->len,
+		  __entry->found ? __entry->pblk : 0,
+		  __entry->found ? __entry->status : 0)
+);
+
 #endif /* _TRACE_EXT4_H */
 
 /* This part must be outside protection */