Patchwork ext4: restart ext4_ext_remove_space() after transaction restart

login
register
mail settings
Submitter Dmitri Monakho
Date April 22, 2010, 4:31 a.m.
Message ID <1271910671-16627-1-git-send-email-dmonakhov@openvz.org>
Download mbox | patch
Permalink /patch/50687/
State Accepted, archived
Delegated to: Theodore Ts'o
Headers show

Comments

Dmitri Monakho - April 22, 2010, 4:31 a.m.
If i_data_sem was internally dropped due to transaction restart, it is
necessary to restart path look-up because extents tree was possibly
modified by ext4_get_block().

https://bugzilla.kernel.org/show_bug.cgi?id=15827

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
 fs/ext4/ext4.h    |    1 +
 fs/ext4/extents.c |   18 +++++++++++++-----
 2 files changed, 14 insertions(+), 5 deletions(-)
Dmitri Monakho - April 22, 2010, 7:33 a.m.
Dmitry Monakhov <dmonakhov@openvz.org> writes:

Oops. i've missed last 'u' in 'edu', the Ted's email.
Add him to cc.
BTW seems that there is only one reproducible bug is left after
227's xfstest case. The bug with wrong i_blocks count:
Pass 1: Checking inodes, blocks, and sizes
Inode 3855, i_blocks is 288, should be 304.  Fix? no
This will be the hardest one because i'm able to discover wrong blocks
count only on fsck stage. I'm suspecting what quota charge is missed
somewhere for metablocks. Continue digging. 
> If i_data_sem was internally dropped due to transaction restart, it is
> necessary to restart path look-up because extents tree was possibly
> modified by ext4_get_block().
>
> https://bugzilla.kernel.org/show_bug.cgi?id=15827
>
> Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
> ---
>  fs/ext4/ext4.h    |    1 +
>  fs/ext4/extents.c |   18 +++++++++++++-----
>  2 files changed, 14 insertions(+), 5 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 6641c58..c69efb2 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1089,6 +1089,7 @@ enum {
>  	EXT4_STATE_DA_ALLOC_CLOSE,	/* Alloc DA blks on close */
>  	EXT4_STATE_EXT_MIGRATE,		/* Inode is migrating */
>  	EXT4_STATE_DIO_UNWRITTEN,	/* need convert on dio done*/
> +	EXT4_STATE_EXT_TRUNC,		/* truncate is in progress, modified under i_data_sem */
>  };
>  
>  #define EXT4_INODE_BIT_FNS(name, field)					\
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 4fa103c..6856272 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -107,11 +107,8 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle,
>  	if (err <= 0)
>  		return err;
>  	err = ext4_truncate_restart_trans(handle, inode, needed);
> -	/*
> -	 * We have dropped i_data_sem so someone might have cached again
> -	 * an extent we are going to truncate.
> -	 */
> -	ext4_ext_invalidate_cache(inode);
> +	if (!err && !ext4_test_inode_state(inode, EXT4_STATE_EXT_TRUNC))
> +		err = -EAGAIN;
>  
>  	return err;
>  }
> @@ -2370,12 +2367,15 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
>  	if (IS_ERR(handle))
>  		return PTR_ERR(handle);
>  
> +again:
>  	ext4_ext_invalidate_cache(inode);
>  
>  	/*
>  	 * We start scanning from right side, freeing all the blocks
>  	 * after i_size and walking into the tree depth-wise.
>  	 */
> +	ext4_set_inode_state(inode, EXT4_STATE_EXT_TRUNC);
> +	depth = ext_depth(inode);
>  	path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
>  	if (path == NULL) {
>  		ext4_journal_stop(handle);
> @@ -2480,6 +2480,11 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
>  out:
>  	ext4_ext_drop_refs(path);
>  	kfree(path);
> +	if (err == EAGAIN) {
> +		err = 0;
> +		goto again;
> +	}
> +	ext4_clear_inode_state(inode, EXT4_STATE_EXT_TRUNC);
>  	ext4_journal_stop(handle);
>  
>  	return err;
> @@ -3338,6 +3343,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
>  	ext_debug("blocks %u/%u requested for inode %lu\n",
>  			iblock, max_blocks, inode->i_ino);
>  
> +	if (unlikely((flags & EXT4_GET_BLOCKS_CREATE)) &&
> +		ext4_test_inode_state(inode, EXT4_STATE_EXT_TRUNC))
> +		ext4_clear_inode_state(inode, EXT4_STATE_EXT_TRUNC);
>  	/* check in cache */
>  	cache_type = ext4_ext_in_cache(inode, iblock, &newex);
>  	if (cache_type) {

>  LocalWords:  xfstest
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara - April 26, 2010, 4:09 p.m.
On Thu 22-04-10 08:31:11, Dmitry Monakhov wrote:
> If i_data_sem was internally dropped due to transaction restart, it is
> necessary to restart path look-up because extents tree was possibly
> modified by ext4_get_block().
> 
> https://bugzilla.kernel.org/show_bug.cgi?id=15827
  Nice spotting. The patch looks good to me.

Acked-by: Jan Kara <jack@suse.cz>

								Honza
> 
> Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
> ---
>  fs/ext4/ext4.h    |    1 +
>  fs/ext4/extents.c |   18 +++++++++++++-----
>  2 files changed, 14 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 6641c58..c69efb2 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1089,6 +1089,7 @@ enum {
>  	EXT4_STATE_DA_ALLOC_CLOSE,	/* Alloc DA blks on close */
>  	EXT4_STATE_EXT_MIGRATE,		/* Inode is migrating */
>  	EXT4_STATE_DIO_UNWRITTEN,	/* need convert on dio done*/
> +	EXT4_STATE_EXT_TRUNC,		/* truncate is in progress, modified under i_data_sem */
>  };
>  
>  #define EXT4_INODE_BIT_FNS(name, field)					\
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 4fa103c..6856272 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -107,11 +107,8 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle,
>  	if (err <= 0)
>  		return err;
>  	err = ext4_truncate_restart_trans(handle, inode, needed);
> -	/*
> -	 * We have dropped i_data_sem so someone might have cached again
> -	 * an extent we are going to truncate.
> -	 */
> -	ext4_ext_invalidate_cache(inode);
> +	if (!err && !ext4_test_inode_state(inode, EXT4_STATE_EXT_TRUNC))
> +		err = -EAGAIN;
>  
>  	return err;
>  }
> @@ -2370,12 +2367,15 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
>  	if (IS_ERR(handle))
>  		return PTR_ERR(handle);
>  
> +again:
>  	ext4_ext_invalidate_cache(inode);
>  
>  	/*
>  	 * We start scanning from right side, freeing all the blocks
>  	 * after i_size and walking into the tree depth-wise.
>  	 */
> +	ext4_set_inode_state(inode, EXT4_STATE_EXT_TRUNC);
> +	depth = ext_depth(inode);
>  	path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
>  	if (path == NULL) {
>  		ext4_journal_stop(handle);
> @@ -2480,6 +2480,11 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
>  out:
>  	ext4_ext_drop_refs(path);
>  	kfree(path);
> +	if (err == EAGAIN) {
> +		err = 0;
> +		goto again;
> +	}
> +	ext4_clear_inode_state(inode, EXT4_STATE_EXT_TRUNC);
>  	ext4_journal_stop(handle);
>  
>  	return err;
> @@ -3338,6 +3343,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
>  	ext_debug("blocks %u/%u requested for inode %lu\n",
>  			iblock, max_blocks, inode->i_ino);
>  
> +	if (unlikely((flags & EXT4_GET_BLOCKS_CREATE)) &&
> +		ext4_test_inode_state(inode, EXT4_STATE_EXT_TRUNC))
> +		ext4_clear_inode_state(inode, EXT4_STATE_EXT_TRUNC);
>  	/* check in cache */
>  	cache_type = ext4_ext_in_cache(inode, iblock, &newex);
>  	if (cache_type) {
> -- 
> 1.6.6.1
>
Theodore Ts'o - May 25, 2010, 1:32 p.m.
On Thu, Apr 22, 2010 at 08:31:11AM +0400, Dmitry Monakhov wrote:
> @@ -2480,6 +2480,11 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
>  out:
>  	ext4_ext_drop_refs(path);
>  	kfree(path);
> +	if (err == EAGAIN) {

Surely this should be "err == -EAGAIN", no?  I'm curious how this
patch worked for with this typo....

> +		err = 0;
> +		goto again;
> +	}
> +	ext4_clear_inode_state(inode, EXT4_STATE_EXT_TRUNC);
>  	ext4_journal_stop(handle);
>  
>  	return err;

					- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Theodore Ts'o - May 25, 2010, 1:55 p.m.
On Thu, Apr 22, 2010 at 08:31:11AM +0400, Dmitry Monakhov wrote:
> If i_data_sem was internally dropped due to transaction restart, it is
> necessary to restart path look-up because extents tree was possibly
> modified by ext4_get_block().
> 
> https://bugzilla.kernel.org/show_bug.cgi?id=15827
> 
> Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>

I *think* it would be more efficient to add the additional change:

  	  if (ext4_ext_more_to_rm(path + i)) {

to:

  	  if ((err != -EAGAIN) && ext4_ext_more_to_rm(path + i)) {

in ext4_ext_remove_space() but I would like your opinion...

If we do this optimization I'll probably do it in a separate patch.
It just seems that we're doing a lot of extra work once we fail to
extend the transaction, so it would be good to optimize more of this
out.  It also makes it easier to convince oneself that that all of the
spinning around that happens after returning EAGAIN won't cause any
harm.  After going through the patch carefully I'm pretty sure the
extra work is pointless, but not harmful, but it's better to skip it
entirely if it's not needed.

					 - Ted

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dmitri Monakho - May 25, 2010, 2:28 p.m.
tytso@mit.edu writes:

> On Thu, Apr 22, 2010 at 08:31:11AM +0400, Dmitry Monakhov wrote:
>> @@ -2480,6 +2480,11 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
>>  out:
>>  	ext4_ext_drop_refs(path);
>>  	kfree(path);
>> +	if (err == EAGAIN) {
>
> Surely this should be "err == -EAGAIN", no?  I'm curious how this
> patch worked for with this typo....
As usually it fix one thing, and broke another :(.
So in case of alloc/truncate restart truncate will be aborted,
so i_size != i_disk_size which must be caught by fsck (my test run
it every time) but this never happens which is very strange.
The only reason i can explain this that truncate was called second
time which is probable due to should_retry_alloc logic.
Even more than this, i've changed the mistypo and have got massive
complain from fsck
Block bitmap differences:  -7954 -(33836--33854)
Fix<y>? yes

Free blocks count wrong for group #0 (24170, counted=24171).
Fix<y>? yes
...
Currently i'm digging the issue.

>
>> +		err = 0;
>> +		goto again;
>> +	}
>> +	ext4_clear_inode_state(inode, EXT4_STATE_EXT_TRUNC);
>>  	ext4_journal_stop(handle);
>>  
>>  	return err;
>
> 					- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Theodore Ts'o - May 25, 2010, 9:38 p.m.
On Tue, May 25, 2010 at 06:28:29PM +0400, Dmitry Monakhov wrote:
> tytso@mit.edu writes:
> 
> > On Thu, Apr 22, 2010 at 08:31:11AM +0400, Dmitry Monakhov wrote:
> >> @@ -2480,6 +2480,11 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
> >>  out:
> >>  	ext4_ext_drop_refs(path);
> >>  	kfree(path);
> >> +	if (err == EAGAIN) {
> >
> > Surely this should be "err == -EAGAIN", no?  I'm curious how this
> > patch worked for with this typo....
> As usually it fix one thing, and broke another :(.
> So in case of alloc/truncate restart truncate will be aborted,
> so i_size != i_disk_size which must be caught by fsck (my test run
> it every time) but this never happens which is very strange.
> The only reason i can explain this that truncate was called second
> time which is probable due to should_retry_alloc logic.

Does adding the optimization I suggested help?  I was nervous because
we don't immediately abort the loop after the rm_leaf function returns
-EAGAIN.  And disentangling the code to free the buffer references
from the other processing that was happening was difficult, and I was
worried about other potential side effects when the code tried to
modify blocks that were already added to the transaction.

       	      	   		      	     - Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Theodore Ts'o - May 25, 2010, 9:44 p.m.
On Tue, May 25, 2010 at 06:28:29PM +0400, Dmitry Monakhov wrote:
> tytso@mit.edu writes:
> 
> > On Thu, Apr 22, 2010 at 08:31:11AM +0400, Dmitry Monakhov wrote:
> >> @@ -2480,6 +2480,11 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
> >>  out:
> >>  	ext4_ext_drop_refs(path);
> >>  	kfree(path);
> >> +	if (err == EAGAIN) {
> >
> > Surely this should be "err == -EAGAIN", no?  I'm curious how this
> > patch worked for with this typo....
> As usually it fix one thing, and broke another :(.
> So in case of alloc/truncate restart truncate will be aborted,
> so i_size != i_disk_size which must be caught by fsck (my test run
> it every time) but this never happens which is very strange.

What test case are you using?  And does it require a system crash to
show up, or are you seeing an fsck problem after the test completes
and you unmount the file system?

						- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dmitri Monakho - May 26, 2010, 8:53 a.m.
tytso@mit.edu writes:

> On Tue, May 25, 2010 at 06:28:29PM +0400, Dmitry Monakhov wrote:
>> tytso@mit.edu writes:
>> 
>> > On Thu, Apr 22, 2010 at 08:31:11AM +0400, Dmitry Monakhov wrote:
>> >> @@ -2480,6 +2480,11 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
>> >>  out:
>> >>  	ext4_ext_drop_refs(path);
>> >>  	kfree(path);
>> >> +	if (err == EAGAIN) {
>> >
>> > Surely this should be "err == -EAGAIN", no?  I'm curious how this
>> > patch worked for with this typo....
>> As usually it fix one thing, and broke another :(.
>> So in case of alloc/truncate restart truncate will be aborted,
>> so i_size != i_disk_size which must be caught by fsck (my test run
>> it every time) but this never happens which is very strange.
>> The only reason i can explain this that truncate was called second
>> time which is probable due to should_retry_alloc logic.
>
> Does adding the optimization I suggested help?  I was nervous because
> we don't immediately abort the loop after the rm_leaf function returns
> -EAGAIN.
Sorry, but seems i don't get your idea.
we have following code:
2394: while (i >= 0 && err == 0) {
                if (i == depth) {
                        /* this is leaf block */
                        err = ext4_ext_rm_leaf(handle, inode, path,
                        start);
                        /* root level has p_bh == NULL, brelse() eats this */
                        brelse(path[i].p_bh);
                        path[i].p_bh = NULL;
                        i--;
                        continue;
                       ^^^^^^^^^^^^^^^^^
<<< So if rm_leaf has failed we will quit from while loop
                }

>And disentangling the code to free the buffer references
> from the other processing that was happening was difficult, and I was
> worried about other potential side effects when the code tried to
> modify blocks that were already added to the transaction.
>
>        	      	   		      	     - Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6641c58..c69efb2 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1089,6 +1089,7 @@  enum {
 	EXT4_STATE_DA_ALLOC_CLOSE,	/* Alloc DA blks on close */
 	EXT4_STATE_EXT_MIGRATE,		/* Inode is migrating */
 	EXT4_STATE_DIO_UNWRITTEN,	/* need convert on dio done*/
+	EXT4_STATE_EXT_TRUNC,		/* truncate is in progress, modified under i_data_sem */
 };
 
 #define EXT4_INODE_BIT_FNS(name, field)					\
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 4fa103c..6856272 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -107,11 +107,8 @@  static int ext4_ext_truncate_extend_restart(handle_t *handle,
 	if (err <= 0)
 		return err;
 	err = ext4_truncate_restart_trans(handle, inode, needed);
-	/*
-	 * We have dropped i_data_sem so someone might have cached again
-	 * an extent we are going to truncate.
-	 */
-	ext4_ext_invalidate_cache(inode);
+	if (!err && !ext4_test_inode_state(inode, EXT4_STATE_EXT_TRUNC))
+		err = -EAGAIN;
 
 	return err;
 }
@@ -2370,12 +2367,15 @@  static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
+again:
 	ext4_ext_invalidate_cache(inode);
 
 	/*
 	 * We start scanning from right side, freeing all the blocks
 	 * after i_size and walking into the tree depth-wise.
 	 */
+	ext4_set_inode_state(inode, EXT4_STATE_EXT_TRUNC);
+	depth = ext_depth(inode);
 	path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
 	if (path == NULL) {
 		ext4_journal_stop(handle);
@@ -2480,6 +2480,11 @@  static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 out:
 	ext4_ext_drop_refs(path);
 	kfree(path);
+	if (err == EAGAIN) {
+		err = 0;
+		goto again;
+	}
+	ext4_clear_inode_state(inode, EXT4_STATE_EXT_TRUNC);
 	ext4_journal_stop(handle);
 
 	return err;
@@ -3338,6 +3343,9 @@  int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	ext_debug("blocks %u/%u requested for inode %lu\n",
 			iblock, max_blocks, inode->i_ino);
 
+	if (unlikely((flags & EXT4_GET_BLOCKS_CREATE)) &&
+		ext4_test_inode_state(inode, EXT4_STATE_EXT_TRUNC))
+		ext4_clear_inode_state(inode, EXT4_STATE_EXT_TRUNC);
 	/* check in cache */
 	cache_type = ext4_ext_in_cache(inode, iblock, &newex);
 	if (cache_type) {