Message ID | 1271910671-16627-1-git-send-email-dmonakhov@openvz.org |
---|---|
State | Accepted, archived |
Delegated to: | Theodore Ts'o |
Headers | show |
Dmitry Monakhov <dmonakhov@openvz.org> writes: Oops. i've missed last 'u' in 'edu', the Ted's email. Add him to cc. BTW seems that there is only one reproducible bug is left after 227's xfstest case. The bug with wrong i_blocks count: Pass 1: Checking inodes, blocks, and sizes Inode 3855, i_blocks is 288, should be 304. Fix? no This will be the hardest one because i'm able to discover wrong blocks count only on fsck stage. I'm suspecting what quota charge is missed somewhere for metablocks. Continue digging. > If i_data_sem was internally dropped due to transaction restart, it is > necessary to restart path look-up because extents tree was possibly > modified by ext4_get_block(). > > https://bugzilla.kernel.org/show_bug.cgi?id=15827 > > Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org> > --- > fs/ext4/ext4.h | 1 + > fs/ext4/extents.c | 18 +++++++++++++----- > 2 files changed, 14 insertions(+), 5 deletions(-) > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h > index 6641c58..c69efb2 100644 > --- a/fs/ext4/ext4.h > +++ b/fs/ext4/ext4.h > @@ -1089,6 +1089,7 @@ enum { > EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ > EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ > EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ > + EXT4_STATE_EXT_TRUNC, /* truncate is in progress, modified under i_data_sem */ > }; > > #define EXT4_INODE_BIT_FNS(name, field) \ > diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c > index 4fa103c..6856272 100644 > --- a/fs/ext4/extents.c > +++ b/fs/ext4/extents.c > @@ -107,11 +107,8 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle, > if (err <= 0) > return err; > err = ext4_truncate_restart_trans(handle, inode, needed); > - /* > - * We have dropped i_data_sem so someone might have cached again > - * an extent we are going to truncate. > - */ > - ext4_ext_invalidate_cache(inode); > + if (!err && !ext4_test_inode_state(inode, EXT4_STATE_EXT_TRUNC)) > + err = -EAGAIN; > > return err; > } > @@ -2370,12 +2367,15 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) > if (IS_ERR(handle)) > return PTR_ERR(handle); > > +again: > ext4_ext_invalidate_cache(inode); > > /* > * We start scanning from right side, freeing all the blocks > * after i_size and walking into the tree depth-wise. > */ > + ext4_set_inode_state(inode, EXT4_STATE_EXT_TRUNC); > + depth = ext_depth(inode); > path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS); > if (path == NULL) { > ext4_journal_stop(handle); > @@ -2480,6 +2480,11 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) > out: > ext4_ext_drop_refs(path); > kfree(path); > + if (err == EAGAIN) { > + err = 0; > + goto again; > + } > + ext4_clear_inode_state(inode, EXT4_STATE_EXT_TRUNC); > ext4_journal_stop(handle); > > return err; > @@ -3338,6 +3343,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, > ext_debug("blocks %u/%u requested for inode %lu\n", > iblock, max_blocks, inode->i_ino); > > + if (unlikely((flags & EXT4_GET_BLOCKS_CREATE)) && > + ext4_test_inode_state(inode, EXT4_STATE_EXT_TRUNC)) > + ext4_clear_inode_state(inode, EXT4_STATE_EXT_TRUNC); > /* check in cache */ > cache_type = ext4_ext_in_cache(inode, iblock, &newex); > if (cache_type) { > LocalWords: xfstest -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu 22-04-10 08:31:11, Dmitry Monakhov wrote: > If i_data_sem was internally dropped due to transaction restart, it is > necessary to restart path look-up because extents tree was possibly > modified by ext4_get_block(). > > https://bugzilla.kernel.org/show_bug.cgi?id=15827 Nice spotting. The patch looks good to me. Acked-by: Jan Kara <jack@suse.cz> Honza > > Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org> > --- > fs/ext4/ext4.h | 1 + > fs/ext4/extents.c | 18 +++++++++++++----- > 2 files changed, 14 insertions(+), 5 deletions(-) > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h > index 6641c58..c69efb2 100644 > --- a/fs/ext4/ext4.h > +++ b/fs/ext4/ext4.h > @@ -1089,6 +1089,7 @@ enum { > EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ > EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ > EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ > + EXT4_STATE_EXT_TRUNC, /* truncate is in progress, modified under i_data_sem */ > }; > > #define EXT4_INODE_BIT_FNS(name, field) \ > diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c > index 4fa103c..6856272 100644 > --- a/fs/ext4/extents.c > +++ b/fs/ext4/extents.c > @@ -107,11 +107,8 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle, > if (err <= 0) > return err; > err = ext4_truncate_restart_trans(handle, inode, needed); > - /* > - * We have dropped i_data_sem so someone might have cached again > - * an extent we are going to truncate. > - */ > - ext4_ext_invalidate_cache(inode); > + if (!err && !ext4_test_inode_state(inode, EXT4_STATE_EXT_TRUNC)) > + err = -EAGAIN; > > return err; > } > @@ -2370,12 +2367,15 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) > if (IS_ERR(handle)) > return PTR_ERR(handle); > > +again: > ext4_ext_invalidate_cache(inode); > > /* > * We start scanning from right side, freeing all the blocks > * after i_size and walking into the tree depth-wise. > */ > + ext4_set_inode_state(inode, EXT4_STATE_EXT_TRUNC); > + depth = ext_depth(inode); > path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS); > if (path == NULL) { > ext4_journal_stop(handle); > @@ -2480,6 +2480,11 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) > out: > ext4_ext_drop_refs(path); > kfree(path); > + if (err == EAGAIN) { > + err = 0; > + goto again; > + } > + ext4_clear_inode_state(inode, EXT4_STATE_EXT_TRUNC); > ext4_journal_stop(handle); > > return err; > @@ -3338,6 +3343,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, > ext_debug("blocks %u/%u requested for inode %lu\n", > iblock, max_blocks, inode->i_ino); > > + if (unlikely((flags & EXT4_GET_BLOCKS_CREATE)) && > + ext4_test_inode_state(inode, EXT4_STATE_EXT_TRUNC)) > + ext4_clear_inode_state(inode, EXT4_STATE_EXT_TRUNC); > /* check in cache */ > cache_type = ext4_ext_in_cache(inode, iblock, &newex); > if (cache_type) { > -- > 1.6.6.1 >
On Thu, Apr 22, 2010 at 08:31:11AM +0400, Dmitry Monakhov wrote: > @@ -2480,6 +2480,11 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) > out: > ext4_ext_drop_refs(path); > kfree(path); > + if (err == EAGAIN) { Surely this should be "err == -EAGAIN", no? I'm curious how this patch worked for with this typo.... > + err = 0; > + goto again; > + } > + ext4_clear_inode_state(inode, EXT4_STATE_EXT_TRUNC); > ext4_journal_stop(handle); > > return err; - Ted -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Apr 22, 2010 at 08:31:11AM +0400, Dmitry Monakhov wrote: > If i_data_sem was internally dropped due to transaction restart, it is > necessary to restart path look-up because extents tree was possibly > modified by ext4_get_block(). > > https://bugzilla.kernel.org/show_bug.cgi?id=15827 > > Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org> I *think* it would be more efficient to add the additional change: if (ext4_ext_more_to_rm(path + i)) { to: if ((err != -EAGAIN) && ext4_ext_more_to_rm(path + i)) { in ext4_ext_remove_space() but I would like your opinion... If we do this optimization I'll probably do it in a separate patch. It just seems that we're doing a lot of extra work once we fail to extend the transaction, so it would be good to optimize more of this out. It also makes it easier to convince oneself that that all of the spinning around that happens after returning EAGAIN won't cause any harm. After going through the patch carefully I'm pretty sure the extra work is pointless, but not harmful, but it's better to skip it entirely if it's not needed. - Ted -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
tytso@mit.edu writes: > On Thu, Apr 22, 2010 at 08:31:11AM +0400, Dmitry Monakhov wrote: >> @@ -2480,6 +2480,11 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) >> out: >> ext4_ext_drop_refs(path); >> kfree(path); >> + if (err == EAGAIN) { > > Surely this should be "err == -EAGAIN", no? I'm curious how this > patch worked for with this typo.... As usually it fix one thing, and broke another :(. So in case of alloc/truncate restart truncate will be aborted, so i_size != i_disk_size which must be caught by fsck (my test run it every time) but this never happens which is very strange. The only reason i can explain this that truncate was called second time which is probable due to should_retry_alloc logic. Even more than this, i've changed the mistypo and have got massive complain from fsck Block bitmap differences: -7954 -(33836--33854) Fix<y>? yes Free blocks count wrong for group #0 (24170, counted=24171). Fix<y>? yes ... Currently i'm digging the issue. > >> + err = 0; >> + goto again; >> + } >> + ext4_clear_inode_state(inode, EXT4_STATE_EXT_TRUNC); >> ext4_journal_stop(handle); >> >> return err; > > - Ted -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, May 25, 2010 at 06:28:29PM +0400, Dmitry Monakhov wrote: > tytso@mit.edu writes: > > > On Thu, Apr 22, 2010 at 08:31:11AM +0400, Dmitry Monakhov wrote: > >> @@ -2480,6 +2480,11 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) > >> out: > >> ext4_ext_drop_refs(path); > >> kfree(path); > >> + if (err == EAGAIN) { > > > > Surely this should be "err == -EAGAIN", no? I'm curious how this > > patch worked for with this typo.... > As usually it fix one thing, and broke another :(. > So in case of alloc/truncate restart truncate will be aborted, > so i_size != i_disk_size which must be caught by fsck (my test run > it every time) but this never happens which is very strange. > The only reason i can explain this that truncate was called second > time which is probable due to should_retry_alloc logic. Does adding the optimization I suggested help? I was nervous because we don't immediately abort the loop after the rm_leaf function returns -EAGAIN. And disentangling the code to free the buffer references from the other processing that was happening was difficult, and I was worried about other potential side effects when the code tried to modify blocks that were already added to the transaction. - Ted -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, May 25, 2010 at 06:28:29PM +0400, Dmitry Monakhov wrote: > tytso@mit.edu writes: > > > On Thu, Apr 22, 2010 at 08:31:11AM +0400, Dmitry Monakhov wrote: > >> @@ -2480,6 +2480,11 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) > >> out: > >> ext4_ext_drop_refs(path); > >> kfree(path); > >> + if (err == EAGAIN) { > > > > Surely this should be "err == -EAGAIN", no? I'm curious how this > > patch worked for with this typo.... > As usually it fix one thing, and broke another :(. > So in case of alloc/truncate restart truncate will be aborted, > so i_size != i_disk_size which must be caught by fsck (my test run > it every time) but this never happens which is very strange. What test case are you using? And does it require a system crash to show up, or are you seeing an fsck problem after the test completes and you unmount the file system? - Ted -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
tytso@mit.edu writes: > On Tue, May 25, 2010 at 06:28:29PM +0400, Dmitry Monakhov wrote: >> tytso@mit.edu writes: >> >> > On Thu, Apr 22, 2010 at 08:31:11AM +0400, Dmitry Monakhov wrote: >> >> @@ -2480,6 +2480,11 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) >> >> out: >> >> ext4_ext_drop_refs(path); >> >> kfree(path); >> >> + if (err == EAGAIN) { >> > >> > Surely this should be "err == -EAGAIN", no? I'm curious how this >> > patch worked for with this typo.... >> As usually it fix one thing, and broke another :(. >> So in case of alloc/truncate restart truncate will be aborted, >> so i_size != i_disk_size which must be caught by fsck (my test run >> it every time) but this never happens which is very strange. >> The only reason i can explain this that truncate was called second >> time which is probable due to should_retry_alloc logic. > > Does adding the optimization I suggested help? I was nervous because > we don't immediately abort the loop after the rm_leaf function returns > -EAGAIN. Sorry, but seems i don't get your idea. we have following code: 2394: while (i >= 0 && err == 0) { if (i == depth) { /* this is leaf block */ err = ext4_ext_rm_leaf(handle, inode, path, start); /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); path[i].p_bh = NULL; i--; continue; ^^^^^^^^^^^^^^^^^ <<< So if rm_leaf has failed we will quit from while loop } >And disentangling the code to free the buffer references > from the other processing that was happening was difficult, and I was > worried about other potential side effects when the code tried to > modify blocks that were already added to the transaction. > > - Ted -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6641c58..c69efb2 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1089,6 +1089,7 @@ enum { EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ + EXT4_STATE_EXT_TRUNC, /* truncate is in progress, modified under i_data_sem */ }; #define EXT4_INODE_BIT_FNS(name, field) \ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 4fa103c..6856272 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -107,11 +107,8 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle, if (err <= 0) return err; err = ext4_truncate_restart_trans(handle, inode, needed); - /* - * We have dropped i_data_sem so someone might have cached again - * an extent we are going to truncate. - */ - ext4_ext_invalidate_cache(inode); + if (!err && !ext4_test_inode_state(inode, EXT4_STATE_EXT_TRUNC)) + err = -EAGAIN; return err; } @@ -2370,12 +2367,15 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) if (IS_ERR(handle)) return PTR_ERR(handle); +again: ext4_ext_invalidate_cache(inode); /* * We start scanning from right side, freeing all the blocks * after i_size and walking into the tree depth-wise. */ + ext4_set_inode_state(inode, EXT4_STATE_EXT_TRUNC); + depth = ext_depth(inode); path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS); if (path == NULL) { ext4_journal_stop(handle); @@ -2480,6 +2480,11 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) out: ext4_ext_drop_refs(path); kfree(path); + if (err == EAGAIN) { + err = 0; + goto again; + } + ext4_clear_inode_state(inode, EXT4_STATE_EXT_TRUNC); ext4_journal_stop(handle); return err; @@ -3338,6 +3343,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, ext_debug("blocks %u/%u requested for inode %lu\n", iblock, max_blocks, inode->i_ino); + if (unlikely((flags & EXT4_GET_BLOCKS_CREATE)) && + ext4_test_inode_state(inode, EXT4_STATE_EXT_TRUNC)) + ext4_clear_inode_state(inode, EXT4_STATE_EXT_TRUNC); /* check in cache */ cache_type = ext4_ext_in_cache(inode, iblock, &newex); if (cache_type) {
If i_data_sem was internally dropped due to transaction restart, it is necessary to restart path look-up because extents tree was possibly modified by ext4_get_block(). https://bugzilla.kernel.org/show_bug.cgi?id=15827 Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org> --- fs/ext4/ext4.h | 1 + fs/ext4/extents.c | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 5 deletions(-)