Patchwork [5/9,v4] ext4: track all extent status in extent status tree

login
register
mail settings
Submitter Zheng Liu
Date Jan. 31, 2013, 5:17 a.m.
Message ID <1359609477-29845-6-git-send-email-wenqing.lz@taobao.com>
Download mbox | patch
Permalink /patch/217083/
State Accepted
Headers show

Comments

Zheng Liu - Jan. 31, 2013, 5:17 a.m.
From: Zheng Liu <wenqing.lz@taobao.com>

By recording the phycisal block and status, extent status tree is able
to track the status of every extents.  When we call _map_blocks
functions to lookup an extent or create a new written/unwritten/delayed
extent, this extent will be inserted into extent status tree.

We don't load all extents from disk in alloc_inode() because it costs
too much memory, and if a file is opened and closed frequently it will
takes too much time to load all extent information.  So currently when
we create/lookup an extent, this extent will be inserted into extent
status tree.  Hence, the extent status tree may not comprehensively
contain all of the extents found in the file.

Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c |  5 +++-
 fs/ext4/file.c    |  6 +++--
 fs/ext4/inode.c   | 73 ++++++++++++++++++++++++++++++++++++-------------------
 3 files changed, 56 insertions(+), 28 deletions(-)
Jan Kara - Jan. 31, 2013, 4:50 p.m.
On Thu 31-01-13 13:17:53, Zheng Liu wrote:
> From: Zheng Liu <wenqing.lz@taobao.com>
> 
> By recording the phycisal block and status, extent status tree is able
> to track the status of every extents.  When we call _map_blocks
> functions to lookup an extent or create a new written/unwritten/delayed
> extent, this extent will be inserted into extent status tree.
> 
> We don't load all extents from disk in alloc_inode() because it costs
> too much memory, and if a file is opened and closed frequently it will
> takes too much time to load all extent information.  So currently when
> we create/lookup an extent, this extent will be inserted into extent
> status tree.  Hence, the extent status tree may not comprehensively
> contain all of the extents found in the file.
> 
> Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
> Cc: "Theodore Ts'o" <tytso@mit.edu>
> ---
>  fs/ext4/extents.c |  5 +++-
>  fs/ext4/file.c    |  6 +++--
>  fs/ext4/inode.c   | 73 ++++++++++++++++++++++++++++++++++++-------------------
>  3 files changed, 56 insertions(+), 28 deletions(-)
> 
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index aa9a6d2..d23a654 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -2074,7 +2074,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
>  		}
>  
>  		/* This is possible iff next == next_del == EXT_MAX_BLOCKS */
> -		if (next == next_del) {
> +		if (next == next_del && next_del == EXT_MAX_BLOCKS) {
  This doesn't seem to be related, does it?

> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index e09c7cf..f0dda2a 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -615,18 +615,27 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
>  			(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
>  			ext4_da_update_reserve_space(inode, retval, 1);
>  	}
> -	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
> +	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
>  		ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
>  
> -		if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
> -			int ret;
> -delayed_mapped:
> -			/* delayed allocation blocks has been allocated */
> -			ret = ext4_es_remove_extent(inode, map->m_lblk,
> -						    map->m_len);
> -			if (ret < 0)
> -				retval = ret;
> -		}
> +	if (retval > 0) {
> +		int ret, status;
> +
> +		if (flags & EXT4_GET_BLOCKS_PRE_IO)
> +			status = EXTENT_STATUS_UNWRITTEN;
> +		else if (flags & EXT4_GET_BLOCKS_CONVERT)
> +			status = EXTENT_STATUS_WRITTEN;
> +		else if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
> +			status = EXTENT_STATUS_UNWRITTEN;
> +		else if (flags & EXT4_GET_BLOCKS_CREATE)
> +			status = EXTENT_STATUS_WRITTEN;
> +		else
> +			BUG_ON(1);
> +
> +		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
> +					    map->m_pblk, status);
> +		if (ret < 0)
> +			retval = ret;
  Hum, are you sure the extent status will be correct? Won't it be safer to
just use whatever we have in 'map'?

								Honza
Zheng Liu - Feb. 1, 2013, 5:33 a.m.
On Thu, Jan 31, 2013 at 05:50:55PM +0100, Jan Kara wrote:
> On Thu 31-01-13 13:17:53, Zheng Liu wrote:
> > From: Zheng Liu <wenqing.lz@taobao.com>
> > 
> > By recording the phycisal block and status, extent status tree is able
> > to track the status of every extents.  When we call _map_blocks
> > functions to lookup an extent or create a new written/unwritten/delayed
> > extent, this extent will be inserted into extent status tree.
> > 
> > We don't load all extents from disk in alloc_inode() because it costs
> > too much memory, and if a file is opened and closed frequently it will
> > takes too much time to load all extent information.  So currently when
> > we create/lookup an extent, this extent will be inserted into extent
> > status tree.  Hence, the extent status tree may not comprehensively
> > contain all of the extents found in the file.
> > 
> > Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
> > Cc: "Theodore Ts'o" <tytso@mit.edu>
> > ---
> >  fs/ext4/extents.c |  5 +++-
> >  fs/ext4/file.c    |  6 +++--
> >  fs/ext4/inode.c   | 73 ++++++++++++++++++++++++++++++++++++-------------------
> >  3 files changed, 56 insertions(+), 28 deletions(-)
> > 
> > diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> > index aa9a6d2..d23a654 100644
> > --- a/fs/ext4/extents.c
> > +++ b/fs/ext4/extents.c
> > @@ -2074,7 +2074,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
> >  		}
> >  
> >  		/* This is possible iff next == next_del == EXT_MAX_BLOCKS */
> > -		if (next == next_del) {
> > +		if (next == next_del && next_del == EXT_MAX_BLOCKS) {
>   This doesn't seem to be related, does it?

ext4_ext_next_allocated_block() will return EXT_MAX_BLOCKS when it
reaches the end of file.  ext4_find_delayed_extent() does the same
thing.  Before tracking written/unwritten extent it is correct because
next never equals to next_del unless both of them equal to
EXT_MAX_BLOCKS.  However, after that next is possible to equal to
next_del when they don't reach the end of file.  So we need to make sure
next equals to next_del and both of them equal to EXT_MAX_BLOCKS.  In
this condition it indicates that we reach the end of file.  Am I miss
something?

> 
> > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > index e09c7cf..f0dda2a 100644
> > --- a/fs/ext4/inode.c
> > +++ b/fs/ext4/inode.c
> > @@ -615,18 +615,27 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
> >  			(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
> >  			ext4_da_update_reserve_space(inode, retval, 1);
> >  	}
> > -	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
> > +	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
> >  		ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
> >  
> > -		if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
> > -			int ret;
> > -delayed_mapped:
> > -			/* delayed allocation blocks has been allocated */
> > -			ret = ext4_es_remove_extent(inode, map->m_lblk,
> > -						    map->m_len);
> > -			if (ret < 0)
> > -				retval = ret;
> > -		}
> > +	if (retval > 0) {
> > +		int ret, status;
> > +
> > +		if (flags & EXT4_GET_BLOCKS_PRE_IO)
> > +			status = EXTENT_STATUS_UNWRITTEN;
> > +		else if (flags & EXT4_GET_BLOCKS_CONVERT)
> > +			status = EXTENT_STATUS_WRITTEN;
> > +		else if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
> > +			status = EXTENT_STATUS_UNWRITTEN;
> > +		else if (flags & EXT4_GET_BLOCKS_CREATE)
> > +			status = EXTENT_STATUS_WRITTEN;
> > +		else
> > +			BUG_ON(1);
> > +
> > +		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
> > +					    map->m_pblk, status);
> > +		if (ret < 0)
> > +			retval = ret;
>   Hum, are you sure the extent status will be correct? Won't it be safer to
> just use whatever we have in 'map'?

Your meaning is that we need to ignore the error when we insert a extent
into the extent status tree, right?  But that would causes an
inconsistency between status tree and extent tree.  Further,
ext4_es_insert_extent() returns EINVAL or ENOMEM.  I believe that
reporting an error is a better choice.  What do you think?

Thanks,
                                                - Zheng
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara - Feb. 4, 2013, 11:27 a.m.
On Fri 01-02-13 13:33:08, Zheng Liu wrote:
> On Thu, Jan 31, 2013 at 05:50:55PM +0100, Jan Kara wrote:
> > On Thu 31-01-13 13:17:53, Zheng Liu wrote:
> > > From: Zheng Liu <wenqing.lz@taobao.com>
> > > 
> > > By recording the phycisal block and status, extent status tree is able
> > > to track the status of every extents.  When we call _map_blocks
> > > functions to lookup an extent or create a new written/unwritten/delayed
> > > extent, this extent will be inserted into extent status tree.
> > > 
> > > We don't load all extents from disk in alloc_inode() because it costs
> > > too much memory, and if a file is opened and closed frequently it will
> > > takes too much time to load all extent information.  So currently when
> > > we create/lookup an extent, this extent will be inserted into extent
> > > status tree.  Hence, the extent status tree may not comprehensively
> > > contain all of the extents found in the file.
> > > 
> > > Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
> > > Cc: "Theodore Ts'o" <tytso@mit.edu>
> > > ---
> > >  fs/ext4/extents.c |  5 +++-
> > >  fs/ext4/file.c    |  6 +++--
> > >  fs/ext4/inode.c   | 73 ++++++++++++++++++++++++++++++++++++-------------------
> > >  3 files changed, 56 insertions(+), 28 deletions(-)
> > > 
> > > diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> > > index aa9a6d2..d23a654 100644
> > > --- a/fs/ext4/extents.c
> > > +++ b/fs/ext4/extents.c
> > > @@ -2074,7 +2074,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
> > >  		}
> > >  
> > >  		/* This is possible iff next == next_del == EXT_MAX_BLOCKS */
> > > -		if (next == next_del) {
> > > +		if (next == next_del && next_del == EXT_MAX_BLOCKS) {
> >   This doesn't seem to be related, does it?
> 
> ext4_ext_next_allocated_block() will return EXT_MAX_BLOCKS when it
> reaches the end of file.  ext4_find_delayed_extent() does the same
> thing.
  Yes.

> Before tracking written/unwritten extent it is correct because
> next never equals to next_del unless both of them equal to
> EXT_MAX_BLOCKS.  However, after that next is possible to equal to
> next_del when they don't reach the end of file.
  Hum, I have to say I'm somewhat lost in ext4_find_delayed_extent().
ext4_es_find_extent() returns the first extent from extent status tree
containing at / after the given offset. So es.len == 0 iff there's extent
in status tree at or after newex->ec_block. The comment before that
condition suggest something different and I'd expect the return value to be
EXT_MAX_BLOCKS, not 0? Also ext4_find_delayed_extent() would be much less
confusing if it just returned position of next delalloc extent after given
block and didn't try to return length of a hole before that extent in
newex? That value can be easily computed from 'next' and 'next_del' in
ext4_fill_fiemap_extents() anyway... But that's slightly off topic.

To our current discussion ext4_find_delayed_extent() either returns 0 or
start of next delalloc extent (if we found extent of other type in the
status tree we just return 0) so I don't see how next_del == next for other
value than EXT_MAX_BLOCKS. But maybe I miss something.

> So we need to make sure next equals to next_del and both of them equal to
> EXT_MAX_BLOCKS.  In this condition it indicates that we reach the end of
> file.  Am I miss something?
> 
> > 
> > > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > > index e09c7cf..f0dda2a 100644
> > > --- a/fs/ext4/inode.c
> > > +++ b/fs/ext4/inode.c
> > > @@ -615,18 +615,27 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
> > >  			(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
> > >  			ext4_da_update_reserve_space(inode, retval, 1);
> > >  	}
> > > -	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
> > > +	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
> > >  		ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
> > >  
> > > -		if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
> > > -			int ret;
> > > -delayed_mapped:
> > > -			/* delayed allocation blocks has been allocated */
> > > -			ret = ext4_es_remove_extent(inode, map->m_lblk,
> > > -						    map->m_len);
> > > -			if (ret < 0)
> > > -				retval = ret;
> > > -		}
> > > +	if (retval > 0) {
> > > +		int ret, status;
> > > +
> > > +		if (flags & EXT4_GET_BLOCKS_PRE_IO)
> > > +			status = EXTENT_STATUS_UNWRITTEN;
> > > +		else if (flags & EXT4_GET_BLOCKS_CONVERT)
> > > +			status = EXTENT_STATUS_WRITTEN;
> > > +		else if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
> > > +			status = EXTENT_STATUS_UNWRITTEN;
> > > +		else if (flags & EXT4_GET_BLOCKS_CREATE)
> > > +			status = EXTENT_STATUS_WRITTEN;
> > > +		else
> > > +			BUG_ON(1);
> > > +
> > > +		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
> > > +					    map->m_pblk, status);
> > > +		if (ret < 0)
> > > +			retval = ret;
> >   Hum, are you sure the extent status will be correct? Won't it be safer to
> > just use whatever we have in 'map'?
> 
> Your meaning is that we need to ignore the error when we insert a extent
> into the extent status tree, right?  But that would causes an
> inconsistency between status tree and extent tree.  Further,
> ext4_es_insert_extent() returns EINVAL or ENOMEM.  I believe that
> reporting an error is a better choice.  What do you think?
  No, I meant something else. For example you decide extent at given
position is 'UNWRITTEN' just on the basis that someone passed
EXT4_GET_BLOCKS_PRE_IO as get_blocks flags. How do you know? Cannot someone
pass EXT4_GET_BLOCKS_PRE_IO and we actually find out the extent at given
position is fully allocated extent (i.e. WRITTEN) so we do nothing? Then
you would cache bad state in the extent tree... That's why I'd rather see
we derive current 'status' from 'map' where we are sure to have correct
information and don't have to guess it from passed flags.

								Honza
Zheng Liu - Feb. 5, 2013, 3:32 a.m.
On Mon, Feb 04, 2013 at 12:27:09PM +0100, Jan Kara wrote:
> On Fri 01-02-13 13:33:08, Zheng Liu wrote:
> > On Thu, Jan 31, 2013 at 05:50:55PM +0100, Jan Kara wrote:
> > > On Thu 31-01-13 13:17:53, Zheng Liu wrote:
> > > > From: Zheng Liu <wenqing.lz@taobao.com>
> > > > 
> > > > By recording the phycisal block and status, extent status tree is able
> > > > to track the status of every extents.  When we call _map_blocks
> > > > functions to lookup an extent or create a new written/unwritten/delayed
> > > > extent, this extent will be inserted into extent status tree.
> > > > 
> > > > We don't load all extents from disk in alloc_inode() because it costs
> > > > too much memory, and if a file is opened and closed frequently it will
> > > > takes too much time to load all extent information.  So currently when
> > > > we create/lookup an extent, this extent will be inserted into extent
> > > > status tree.  Hence, the extent status tree may not comprehensively
> > > > contain all of the extents found in the file.
> > > > 
> > > > Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
> > > > Cc: "Theodore Ts'o" <tytso@mit.edu>
> > > > ---
> > > >  fs/ext4/extents.c |  5 +++-
> > > >  fs/ext4/file.c    |  6 +++--
> > > >  fs/ext4/inode.c   | 73 ++++++++++++++++++++++++++++++++++++-------------------
> > > >  3 files changed, 56 insertions(+), 28 deletions(-)
> > > > 
> > > > diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> > > > index aa9a6d2..d23a654 100644
> > > > --- a/fs/ext4/extents.c
> > > > +++ b/fs/ext4/extents.c
> > > > @@ -2074,7 +2074,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
> > > >  		}
> > > >  
> > > >  		/* This is possible iff next == next_del == EXT_MAX_BLOCKS */
> > > > -		if (next == next_del) {
> > > > +		if (next == next_del && next_del == EXT_MAX_BLOCKS) {
> > >   This doesn't seem to be related, does it?
> > 
> > ext4_ext_next_allocated_block() will return EXT_MAX_BLOCKS when it
> > reaches the end of file.  ext4_find_delayed_extent() does the same
> > thing.
>   Yes.
> 
> > Before tracking written/unwritten extent it is correct because
> > next never equals to next_del unless both of them equal to
> > EXT_MAX_BLOCKS.  However, after that next is possible to equal to
> > next_del when they don't reach the end of file.
>   Hum, I have to say I'm somewhat lost in ext4_find_delayed_extent().
> ext4_es_find_extent() returns the first extent from extent status tree
> containing at / after the given offset. So es.len == 0 iff there's extent
> in status tree at or after newex->ec_block. The comment before that
> condition suggest something different and I'd expect the return value to be
> EXT_MAX_BLOCKS, not 0? Also ext4_find_delayed_extent() would be much less
> confusing if it just returned position of next delalloc extent after given
> block and didn't try to return length of a hole before that extent in
> newex? That value can be easily computed from 'next' and 'next_del' in
> ext4_fill_fiemap_extents() anyway... But that's slightly off topic.

Yes, ext4_find_delayed_extent is not very clearly now, and I agree with
you that it need to be refactored.  Certainly it is another topic.

> 
> To our current discussion ext4_find_delayed_extent() either returns 0 or
> start of next delalloc extent (if we found extent of other type in the
> status tree we just return 0) so I don't see how next_del == next for other
> value than EXT_MAX_BLOCKS. But maybe I miss something.

After tracking all extent status in status tree, ext4_es_find_extent()
returns not only delayed extent, but also written/unwritten extents.  So
it is possible that next_del == next and its value is not
EXT_MAX_BLOCKS.  *But* in latest version ext4_es_find_extent() will be
changed to only return delayed extent.  So the problem will be fixed.

> 
> > So we need to make sure next equals to next_del and both of them equal to
> > EXT_MAX_BLOCKS.  In this condition it indicates that we reach the end of
> > file.  Am I miss something?
> > 
> > > 
> > > > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > > > index e09c7cf..f0dda2a 100644
> > > > --- a/fs/ext4/inode.c
> > > > +++ b/fs/ext4/inode.c
> > > > @@ -615,18 +615,27 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
> > > >  			(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
> > > >  			ext4_da_update_reserve_space(inode, retval, 1);
> > > >  	}
> > > > -	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
> > > > +	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
> > > >  		ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
> > > >  
> > > > -		if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
> > > > -			int ret;
> > > > -delayed_mapped:
> > > > -			/* delayed allocation blocks has been allocated */
> > > > -			ret = ext4_es_remove_extent(inode, map->m_lblk,
> > > > -						    map->m_len);
> > > > -			if (ret < 0)
> > > > -				retval = ret;
> > > > -		}
> > > > +	if (retval > 0) {
> > > > +		int ret, status;
> > > > +
> > > > +		if (flags & EXT4_GET_BLOCKS_PRE_IO)
> > > > +			status = EXTENT_STATUS_UNWRITTEN;
> > > > +		else if (flags & EXT4_GET_BLOCKS_CONVERT)
> > > > +			status = EXTENT_STATUS_WRITTEN;
> > > > +		else if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
> > > > +			status = EXTENT_STATUS_UNWRITTEN;
> > > > +		else if (flags & EXT4_GET_BLOCKS_CREATE)
> > > > +			status = EXTENT_STATUS_WRITTEN;
> > > > +		else
> > > > +			BUG_ON(1);
> > > > +
> > > > +		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
> > > > +					    map->m_pblk, status);
> > > > +		if (ret < 0)
> > > > +			retval = ret;
> > >   Hum, are you sure the extent status will be correct? Won't it be safer to
> > > just use whatever we have in 'map'?
> > 
> > Your meaning is that we need to ignore the error when we insert a extent
> > into the extent status tree, right?  But that would causes an
> > inconsistency between status tree and extent tree.  Further,
> > ext4_es_insert_extent() returns EINVAL or ENOMEM.  I believe that
> > reporting an error is a better choice.  What do you think?
>   No, I meant something else. For example you decide extent at given
> position is 'UNWRITTEN' just on the basis that someone passed
> EXT4_GET_BLOCKS_PRE_IO as get_blocks flags. How do you know? Cannot someone
> pass EXT4_GET_BLOCKS_PRE_IO and we actually find out the extent at given
> position is fully allocated extent (i.e. WRITTEN) so we do nothing? Then
> you would cache bad state in the extent tree... That's why I'd rather see
> we derive current 'status' from 'map' where we are sure to have correct
> information and don't have to guess it from passed flags.

First of all, we don't need to worry about this problem because we
always lookup an extent before trying to create it.  So when it is an
written extent, we will return from ext4_map_blocks() directly and won't
try to create it.  So status tree don't be touched.

Secondly, as far as I know, ext4_ext_map_blocks() will never return
EXT4_MAP_UNWRITTEN flags after it tries to allocate an unwritten extent.
So that means that 'm_flags' in map is incorrect.  Maybe I miss
something.

Thanks,
                                                - Zheng
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara - Feb. 5, 2013, 12:08 p.m.
On Tue 05-02-13 11:32:21, Zheng Liu wrote:
> On Mon, Feb 04, 2013 at 12:27:09PM +0100, Jan Kara wrote:
> > On Fri 01-02-13 13:33:08, Zheng Liu wrote:
> > > On Thu, Jan 31, 2013 at 05:50:55PM +0100, Jan Kara wrote:
> > > > On Thu 31-01-13 13:17:53, Zheng Liu wrote:
> > > > > From: Zheng Liu <wenqing.lz@taobao.com>
> > > > > 
> > > > > By recording the phycisal block and status, extent status tree is able
> > > > > to track the status of every extents.  When we call _map_blocks
> > > > > functions to lookup an extent or create a new written/unwritten/delayed
> > > > > extent, this extent will be inserted into extent status tree.
> > > > > 
> > > > > We don't load all extents from disk in alloc_inode() because it costs
> > > > > too much memory, and if a file is opened and closed frequently it will
> > > > > takes too much time to load all extent information.  So currently when
> > > > > we create/lookup an extent, this extent will be inserted into extent
> > > > > status tree.  Hence, the extent status tree may not comprehensively
> > > > > contain all of the extents found in the file.
> > > > > 
> > > > > Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
> > > > > Cc: "Theodore Ts'o" <tytso@mit.edu>
> > > > > ---
> > > > >  fs/ext4/extents.c |  5 +++-
> > > > >  fs/ext4/file.c    |  6 +++--
> > > > >  fs/ext4/inode.c   | 73 ++++++++++++++++++++++++++++++++++++-------------------
> > > > >  3 files changed, 56 insertions(+), 28 deletions(-)
> > > > > 
> > > > > diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> > > > > index aa9a6d2..d23a654 100644
> > > > > --- a/fs/ext4/extents.c
> > > > > +++ b/fs/ext4/extents.c
> > > > > @@ -2074,7 +2074,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
> > > > >  		}
> > > > >  
> > > > >  		/* This is possible iff next == next_del == EXT_MAX_BLOCKS */
> > > > > -		if (next == next_del) {
> > > > > +		if (next == next_del && next_del == EXT_MAX_BLOCKS) {
> > > >   This doesn't seem to be related, does it?
> > > 
> > > ext4_ext_next_allocated_block() will return EXT_MAX_BLOCKS when it
> > > reaches the end of file.  ext4_find_delayed_extent() does the same
> > > thing.
> >   Yes.
> > 
> > > Before tracking written/unwritten extent it is correct because
> > > next never equals to next_del unless both of them equal to
> > > EXT_MAX_BLOCKS.  However, after that next is possible to equal to
> > > next_del when they don't reach the end of file.
  <snip>
> > To our current discussion ext4_find_delayed_extent() either returns 0 or
> > start of next delalloc extent (if we found extent of other type in the
> > status tree we just return 0) so I don't see how next_del == next for other
> > value than EXT_MAX_BLOCKS. But maybe I miss something.
> 
> After tracking all extent status in status tree, ext4_es_find_extent()
> returns not only delayed extent, but also written/unwritten extents.  So
> it is possible that next_del == next and its value is not
> EXT_MAX_BLOCKS.  *But* in latest version ext4_es_find_extent() will be
> changed to only return delayed extent.  So the problem will be fixed.
  Ah, now I see. You added the condition checking whether extent is delayed
only to the newex->ec_start == 0 branch. So if we don't take that branch,
we could have returned an extent which isn't delayed.

IMHO it is a wrong decision for ext4_es_find_extent() to return only
delayed extents. That should really return any extent that contains given
block (or is after it). It is ext4_find_delayed_extent() that should really
be changed to return only delayed extents as its name suggests...

> > > > > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > > > > index e09c7cf..f0dda2a 100644
> > > > > --- a/fs/ext4/inode.c
> > > > > +++ b/fs/ext4/inode.c
> > > > > @@ -615,18 +615,27 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
> > > > >  			(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
> > > > >  			ext4_da_update_reserve_space(inode, retval, 1);
> > > > >  	}
> > > > > -	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
> > > > > +	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
> > > > >  		ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
> > > > >  
> > > > > -		if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
> > > > > -			int ret;
> > > > > -delayed_mapped:
> > > > > -			/* delayed allocation blocks has been allocated */
> > > > > -			ret = ext4_es_remove_extent(inode, map->m_lblk,
> > > > > -						    map->m_len);
> > > > > -			if (ret < 0)
> > > > > -				retval = ret;
> > > > > -		}
> > > > > +	if (retval > 0) {
> > > > > +		int ret, status;
> > > > > +
> > > > > +		if (flags & EXT4_GET_BLOCKS_PRE_IO)
> > > > > +			status = EXTENT_STATUS_UNWRITTEN;
> > > > > +		else if (flags & EXT4_GET_BLOCKS_CONVERT)
> > > > > +			status = EXTENT_STATUS_WRITTEN;
> > > > > +		else if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
> > > > > +			status = EXTENT_STATUS_UNWRITTEN;
> > > > > +		else if (flags & EXT4_GET_BLOCKS_CREATE)
> > > > > +			status = EXTENT_STATUS_WRITTEN;
> > > > > +		else
> > > > > +			BUG_ON(1);
> > > > > +
> > > > > +		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
> > > > > +					    map->m_pblk, status);
> > > > > +		if (ret < 0)
> > > > > +			retval = ret;
> > > >   Hum, are you sure the extent status will be correct? Won't it be safer to
> > > > just use whatever we have in 'map'?
> > > 
> > > Your meaning is that we need to ignore the error when we insert a extent
> > > into the extent status tree, right?  But that would causes an
> > > inconsistency between status tree and extent tree.  Further,
> > > ext4_es_insert_extent() returns EINVAL or ENOMEM.  I believe that
> > > reporting an error is a better choice.  What do you think?
> >   No, I meant something else. For example you decide extent at given
> > position is 'UNWRITTEN' just on the basis that someone passed
> > EXT4_GET_BLOCKS_PRE_IO as get_blocks flags. How do you know? Cannot someone
> > pass EXT4_GET_BLOCKS_PRE_IO and we actually find out the extent at given
> > position is fully allocated extent (i.e. WRITTEN) so we do nothing? Then
> > you would cache bad state in the extent tree... That's why I'd rather see
> > we derive current 'status' from 'map' where we are sure to have correct
> > information and don't have to guess it from passed flags.
> 
> First of all, we don't need to worry about this problem because we
> always lookup an extent before trying to create it.  So when it is an
> written extent, we will return from ext4_map_blocks() directly and won't
> try to create it.  So status tree don't be touched.
  So my argument isn't as much about whether you can deduce the correct
result from flags passed to ext4_map_blocks() but rather that it simply
isn't the right place where to look. The right place where to look what
extent is at given position is 'map' where we store what we found. And you
are right that ext4_ext_map_blocks() isn't properly returning
EXT4_MAP_UNWRITTEN in some cases - thanks for noticing that - but then the
right answer is to fix ext4_ext_map_blocks() to return it and not to hack
around that in extent cache code... Believe me it will save us quite some
head scratching later.

> Secondly, as far as I know, ext4_ext_map_blocks() will never return
> EXT4_MAP_UNWRITTEN flags after it tries to allocate an unwritten extent.
> So that means that 'm_flags' in map is incorrect.  Maybe I miss
> something.

							Bye
								Honza
Zheng Liu - Feb. 5, 2013, 1:24 p.m.
On Tue, Feb 05, 2013 at 01:08:54PM +0100, Jan Kara wrote:
[snip]
> > After tracking all extent status in status tree, ext4_es_find_extent()
> > returns not only delayed extent, but also written/unwritten extents.  So
> > it is possible that next_del == next and its value is not
> > EXT_MAX_BLOCKS.  *But* in latest version ext4_es_find_extent() will be
> > changed to only return delayed extent.  So the problem will be fixed.
>   Ah, now I see. You added the condition checking whether extent is delayed
> only to the newex->ec_start == 0 branch. So if we don't take that branch,
> we could have returned an extent which isn't delayed.
> 
> IMHO it is a wrong decision for ext4_es_find_extent() to return only
> delayed extents. That should really return any extent that contains given
> block (or is after it). It is ext4_find_delayed_extent() that should really
> be changed to return only delayed extents as its name suggests...

I revised the patch series and found that ext4_es_find_extent() is
only used to lookup a delayed extent by the following functions:

 - ext4_find_delalloc_range()
 - ext4_find_delayed_extent()
 - ext4_seek_data()
 - ext4_seek_hole()

So IMHO the better decision is to rename it to ext4_es_find_delayed_extent()
and let it only return delayed extent.  In patch 6/9, a new function
called ext4_es_lookup_extent() is defined to do this job that returns an
extent that contains given block.  What do you think?

[snip]
> > > > >   Hum, are you sure the extent status will be correct? Won't it be safer to
> > > > > just use whatever we have in 'map'?
> > > > 
> > > > Your meaning is that we need to ignore the error when we insert a extent
> > > > into the extent status tree, right?  But that would causes an
> > > > inconsistency between status tree and extent tree.  Further,
> > > > ext4_es_insert_extent() returns EINVAL or ENOMEM.  I believe that
> > > > reporting an error is a better choice.  What do you think?
> > >   No, I meant something else. For example you decide extent at given
> > > position is 'UNWRITTEN' just on the basis that someone passed
> > > EXT4_GET_BLOCKS_PRE_IO as get_blocks flags. How do you know? Cannot someone
> > > pass EXT4_GET_BLOCKS_PRE_IO and we actually find out the extent at given
> > > position is fully allocated extent (i.e. WRITTEN) so we do nothing? Then
> > > you would cache bad state in the extent tree... That's why I'd rather see
> > > we derive current 'status' from 'map' where we are sure to have correct
> > > information and don't have to guess it from passed flags.
> > 
> > First of all, we don't need to worry about this problem because we
> > always lookup an extent before trying to create it.  So when it is an
> > written extent, we will return from ext4_map_blocks() directly and won't
> > try to create it.  So status tree don't be touched.
>   So my argument isn't as much about whether you can deduce the correct
> result from flags passed to ext4_map_blocks() but rather that it simply
> isn't the right place where to look. The right place where to look what
> extent is at given position is 'map' where we store what we found. And you
> are right that ext4_ext_map_blocks() isn't properly returning
> EXT4_MAP_UNWRITTEN in some cases - thanks for noticing that - but then the
> right answer is to fix ext4_ext_map_blocks() to return it and not to hack
> around that in extent cache code... Believe me it will save us quite some
> head scratching later.

Fair enough.  I will try to fix it.

Thanks for your suggestion,
                                                - Zheng
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara - Feb. 5, 2013, 1:27 p.m.
On Tue 05-02-13 21:24:33, Zheng Liu wrote:
> On Tue, Feb 05, 2013 at 01:08:54PM +0100, Jan Kara wrote:
> [snip]
> > > After tracking all extent status in status tree, ext4_es_find_extent()
> > > returns not only delayed extent, but also written/unwritten extents.  So
> > > it is possible that next_del == next and its value is not
> > > EXT_MAX_BLOCKS.  *But* in latest version ext4_es_find_extent() will be
> > > changed to only return delayed extent.  So the problem will be fixed.
> >   Ah, now I see. You added the condition checking whether extent is delayed
> > only to the newex->ec_start == 0 branch. So if we don't take that branch,
> > we could have returned an extent which isn't delayed.
> > 
> > IMHO it is a wrong decision for ext4_es_find_extent() to return only
> > delayed extents. That should really return any extent that contains given
> > block (or is after it). It is ext4_find_delayed_extent() that should really
> > be changed to return only delayed extents as its name suggests...
> 
> I revised the patch series and found that ext4_es_find_extent() is
> only used to lookup a delayed extent by the following functions:
> 
>  - ext4_find_delalloc_range()
>  - ext4_find_delayed_extent()
>  - ext4_seek_data()
>  - ext4_seek_hole()
> 
> So IMHO the better decision is to rename it to ext4_es_find_delayed_extent()
> and let it only return delayed extent.  In patch 6/9, a new function
> called ext4_es_lookup_extent() is defined to do this job that returns an
> extent that contains given block.  What do you think?
  Yes, this works for me. Thanks for looking into it.

								Honza

Patch

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index aa9a6d2..d23a654 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2074,7 +2074,7 @@  static int ext4_fill_fiemap_extents(struct inode *inode,
 		}
 
 		/* This is possible iff next == next_del == EXT_MAX_BLOCKS */
-		if (next == next_del) {
+		if (next == next_del && next_del == EXT_MAX_BLOCKS) {
 			flags |= FIEMAP_EXTENT_LAST;
 			if (unlikely(next_del != EXT_MAX_BLOCKS ||
 				     next != EXT_MAX_BLOCKS)) {
@@ -4570,6 +4570,9 @@  static int ext4_find_delayed_extent(struct inode *inode,
 			/* A hole found. */
 			return 0;
 
+		if (!ext4_es_is_delayed(&es))
+			return 0;
+
 		if (es.es_lblk > newex->ec_block) {
 			/* A hole found. */
 			newex->ec_len = min(es.es_lblk - newex->ec_block,
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 718c49f..afaf9f15 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -466,7 +466,8 @@  static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
 		 */
 		es.es_lblk = last;
 		(void)ext4_es_find_extent(inode, &es);
-		if (last >= es.es_lblk && last < es.es_lblk + es.es_len) {
+		if (ext4_es_is_delayed(&es) &&
+		    last >= es.es_lblk && last < es.es_lblk + es.es_len) {
 			if (last != start)
 				dataoff = last << blkbits;
 			break;
@@ -550,7 +551,8 @@  static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
 		 */
 		es.es_lblk = last;
 		(void)ext4_es_find_extent(inode, &es);
-		if (last >= es.es_lblk && last < es.es_lblk + es.es_len) {
+		if (ext4_es_is_delayed(&es) &&
+		    last >= es.es_lblk && last < es.es_lblk + es.es_len) {
 			last = es.es_lblk + es.es_len;
 			holeoff = last << blkbits;
 			continue;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e09c7cf..f0dda2a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -527,20 +527,20 @@  int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		retval = ext4_ind_map_blocks(handle, inode, map, flags &
 					     EXT4_GET_BLOCKS_KEEP_SIZE);
 	}
+	if (retval > 0) {
+		int ret, status;
+		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+		ret = ext4_es_insert_extent(inode, map->m_lblk,
+					    map->m_len, map->m_pblk, status);
+		if (ret < 0)
+			retval = ret;
+	}
 	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
 		up_read((&EXT4_I(inode)->i_data_sem));
 
 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
-		int ret;
-		if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
-			/* delayed alloc may be allocated by fallocate and
-			 * coverted to initialized by directIO.
-			 * we need to handle delayed extent here.
-			 */
-			down_write((&EXT4_I(inode)->i_data_sem));
-			goto delayed_mapped;
-		}
-		ret = check_block_validity(inode, map);
+		int ret = check_block_validity(inode, map);
 		if (ret != 0)
 			return ret;
 	}
@@ -615,18 +615,27 @@  int ext4_map_blocks(handle_t *handle, struct inode *inode,
 			(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
 			ext4_da_update_reserve_space(inode, retval, 1);
 	}
-	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
 		ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
 
-		if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
-			int ret;
-delayed_mapped:
-			/* delayed allocation blocks has been allocated */
-			ret = ext4_es_remove_extent(inode, map->m_lblk,
-						    map->m_len);
-			if (ret < 0)
-				retval = ret;
-		}
+	if (retval > 0) {
+		int ret, status;
+
+		if (flags & EXT4_GET_BLOCKS_PRE_IO)
+			status = EXTENT_STATUS_UNWRITTEN;
+		else if (flags & EXT4_GET_BLOCKS_CONVERT)
+			status = EXTENT_STATUS_WRITTEN;
+		else if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
+			status = EXTENT_STATUS_UNWRITTEN;
+		else if (flags & EXT4_GET_BLOCKS_CREATE)
+			status = EXTENT_STATUS_WRITTEN;
+		else
+			BUG_ON(1);
+
+		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+					    map->m_pblk, status);
+		if (ret < 0)
+			retval = ret;
 	}
 
 	up_write((&EXT4_I(inode)->i_data_sem));
@@ -1805,6 +1814,7 @@  static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 		retval = ext4_ind_map_blocks(NULL, inode, map, 0);
 
 	if (retval == 0) {
+		int ret;
 		/*
 		 * XXX: __block_prepare_write() unmaps passed block,
 		 * is it OK?
@@ -1813,20 +1823,33 @@  static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 		 * then we dont need to reserve it again. */
 		if ((EXT4_SB(inode->i_sb)->s_cluster_ratio == 1) ||
 		    !ext4_find_delalloc_cluster(inode, map->m_lblk)) {
-			retval = ext4_da_reserve_space(inode, iblock);
-			if (retval)
+			ret = ext4_da_reserve_space(inode, iblock);
+			if (ret) {
 				/* not enough space to reserve */
+				retval = ret;
 				goto out_unlock;
+			}
 		}
 
-		retval = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
-					       ~0, EXTENT_STATUS_DELAYED);
-		if (retval)
+		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+					    ~0, EXTENT_STATUS_DELAYED);
+		if (ret) {
+			retval = ret;
 			goto out_unlock;
+		}
 
 		map_bh(bh, inode->i_sb, invalid_block);
 		set_buffer_new(bh);
 		set_buffer_delay(bh);
+	} else if (retval > 0) {
+		int ret, status;
+
+		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+					    map->m_pblk, status);
+		if (ret != 0)
+			retval = ret;
 	}
 
 out_unlock: