| Message ID | 20260511121356.241821-18-jack@suse.cz |
|---|---|
| State | Not Applicable |
| Headers | show |
| Series | fs: Fix missed inode write during fsync | expand |
On Mon, May 11, 2026 at 02:13:59PM +0200, Jan Kara wrote: > Use mmb inode buffer writeout infrastructure to reliably write out > inode's inode table block on fsync(2) in nojournal mode (from > ext4_sync_parent() and ext4_fsync_nojournal()). This significantly > simplifies the code as we don't have to explicitely handle inode buffer > writeback in ext4_write_inode() and thus we can also remove > sync_inode_metadata() calls from ext4_sync_parent() and > ext4_write_inode() call from ext4_fsync_nojournal(). > > Signed-off-by: Jan Kara <jack@suse.cz> > --- > fs/ext4/ext4_jbd2.c | 2 +- > fs/ext4/ext4_jbd2.h | 2 ++ > fs/ext4/fsync.c | 12 ------------ > fs/ext4/inode.c | 24 +++++------------------- > 4 files changed, 8 insertions(+), 32 deletions(-) > > diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c > index 74f05bd0cdde..6bbaf72108fd 100644 > --- a/fs/ext4/ext4_jbd2.c > +++ b/fs/ext4/ext4_jbd2.c > @@ -350,7 +350,7 @@ int __ext4_journal_get_create_access(const char *where, unsigned int line, > return 0; > } > > -static void ext4_inode_attach_mmb(struct inode *inode) > +void ext4_inode_attach_mmb(struct inode *inode) > { > struct mapping_metadata_bhs *mmb; > > diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h > index 63d17c5201b5..2a01b8279c88 100644 > --- a/fs/ext4/ext4_jbd2.h > +++ b/fs/ext4/ext4_jbd2.h > @@ -122,6 +122,8 @@ > #define EXT4_HT_EXT_CONVERT 11 > #define EXT4_HT_MAX 12 > > +void ext4_inode_attach_mmb(struct inode *inode); > + > int > ext4_mark_iloc_dirty(handle_t *handle, > struct inode *inode, > diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c > index e25d365e1179..af84489e57c6 100644 > --- a/fs/ext4/fsync.c > +++ b/fs/ext4/fsync.c > @@ -75,9 +75,6 @@ static int ext4_sync_parent(struct inode *inode) > if (ret) > break; > } > - ret = sync_inode_metadata(inode, 1); > - if (ret) > - break; > } > dput(dentry); > return ret; > @@ -87,10 +84,6 @@ static int ext4_fsync_nojournal(struct file *file, loff_t start, loff_t end, > int datasync, bool 
*needs_barrier) > { > struct inode *inode = file->f_inode; > - struct writeback_control wbc = { > - .sync_mode = WB_SYNC_ALL, > - .nr_to_write = 0, > - }; > int ret; > > ret = mmb_fsync_noflush(file, EXT4_I(inode)->i_metadata_bhs, > @@ -98,11 +91,6 @@ static int ext4_fsync_nojournal(struct file *file, loff_t start, loff_t end, > if (ret) > return ret; > > - /* Force writeout of inode table buffer to disk */ > - ret = ext4_write_inode(inode, &wbc); > - if (ret) > - return ret; > - > ret = ext4_sync_parent(inode); > > if (test_opt(inode->i_sb, BARRIER)) > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c > index 3e66e9510909..09506b4de1b2 100644 > --- a/fs/ext4/inode.c > +++ b/fs/ext4/inode.c > @@ -5786,24 +5786,6 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) > > err = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, > EXT4_I(inode)->i_sync_tid); > - } else { > - struct ext4_iloc iloc; > - > - err = __ext4_get_inode_loc_noinmem(inode, &iloc); > - if (err) > - return err; > - /* > - * sync(2) will flush the whole buffer cache. No need to do > - * it here separately for each inode. > - */ > - if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) > - sync_dirty_buffer(iloc.bh); > - if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { > - ext4_error_inode_block(inode, iloc.bh->b_blocknr, EIO, > - "IO error syncing inode"); > - err = -EIO; > - } > - brelse(iloc.bh); > } > return err; > } > @@ -6348,7 +6330,11 @@ int ext4_mark_iloc_dirty(handle_t *handle, > > /* the do_update_inode consumes one bh->b_count */ > get_bh(iloc->bh); > - > + if (!ext4_handle_valid(handle)) { > + if (!EXT4_I(inode)->i_metadata_bhs) > + ext4_inode_attach_mmb(inode); > + EXT4_I(inode)->i_metadata_bhs->inode_blk = iloc->bh->b_blocknr; The series is great overall. 
The only thing I think we should change is to hide the assignment EXT4_I(inode)->i_metadata_bhs->inode_blk = iloc->bh->b_blocknr; behind a dedicated helper (a static inline or a regular function) instead of open-coding it everywhere. The helper can then also be paired with a VFS_WARN_ON_ONCE() to detect a garbage bh->b_blocknr.
On Mon 11-05-26 15:30:34, Christian Brauner wrote: > On Mon, May 11, 2026 at 02:13:59PM +0200, Jan Kara wrote: > > Use mmb inode buffer writeout infrastructure to reliably write out > > inode's inode table block on fsync(2) in nojournal mode (from > > ext4_sync_parent() and ext4_fsync_nojournal()). This significantly > > simplifies the code as we don't have to explicitly handle inode buffer > > writeback in ext4_write_inode() and thus we can also remove > > sync_inode_metadata() calls from ext4_sync_parent() and > > ext4_write_inode() call from ext4_fsync_nojournal(). > > > > Signed-off-by: Jan Kara <jack@suse.cz> ... > > @@ -6348,7 +6330,11 @@ int ext4_mark_iloc_dirty(handle_t *handle, > > > > /* the do_update_inode consumes one bh->b_count */ > > get_bh(iloc->bh); > > - > > + if (!ext4_handle_valid(handle)) { > > + if (!EXT4_I(inode)->i_metadata_bhs) > > + ext4_inode_attach_mmb(inode); > > + EXT4_I(inode)->i_metadata_bhs->inode_blk = iloc->bh->b_blocknr; > > The series is great overall. The only thing I think we should change is > that we should hide this > > EXT4_I(inode)->i_metadata_bhs->inode_blk = iloc->bh->b_blocknr; > > behind a dedicated static inline/regular function call instead of > open-coding it everywhere. Can then also be paired with some > VFS_WARN_ON_ONCE() to detect garbage bh->b_blocknr. Good point. I've created an mmb_mark_inode_buffer_dirty() helper for this, matching the mmb_mark_buffer_dirty() helper we use for standard metadata buffers. It now also handles dirtying the buffer and synchronizing with mmb_sync() clearing inode_blk. Honza
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 74f05bd0cdde..6bbaf72108fd 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -350,7 +350,7 @@ int __ext4_journal_get_create_access(const char *where, unsigned int line, return 0; } -static void ext4_inode_attach_mmb(struct inode *inode) +void ext4_inode_attach_mmb(struct inode *inode) { struct mapping_metadata_bhs *mmb; diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 63d17c5201b5..2a01b8279c88 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -122,6 +122,8 @@ #define EXT4_HT_EXT_CONVERT 11 #define EXT4_HT_MAX 12 +void ext4_inode_attach_mmb(struct inode *inode); + int ext4_mark_iloc_dirty(handle_t *handle, struct inode *inode, diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index e25d365e1179..af84489e57c6 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -75,9 +75,6 @@ static int ext4_sync_parent(struct inode *inode) if (ret) break; } - ret = sync_inode_metadata(inode, 1); - if (ret) - break; } dput(dentry); return ret; @@ -87,10 +84,6 @@ static int ext4_fsync_nojournal(struct file *file, loff_t start, loff_t end, int datasync, bool *needs_barrier) { struct inode *inode = file->f_inode; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 0, - }; int ret; ret = mmb_fsync_noflush(file, EXT4_I(inode)->i_metadata_bhs, @@ -98,11 +91,6 @@ static int ext4_fsync_nojournal(struct file *file, loff_t start, loff_t end, if (ret) return ret; - /* Force writeout of inode table buffer to disk */ - ret = ext4_write_inode(inode, &wbc); - if (ret) - return ret; - ret = ext4_sync_parent(inode); if (test_opt(inode->i_sb, BARRIER)) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3e66e9510909..09506b4de1b2 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5786,24 +5786,6 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) err = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, EXT4_I(inode)->i_sync_tid); - } else { - struct 
ext4_iloc iloc; - - err = __ext4_get_inode_loc_noinmem(inode, &iloc); - if (err) - return err; - /* - * sync(2) will flush the whole buffer cache. No need to do - * it here separately for each inode. - */ - if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) - sync_dirty_buffer(iloc.bh); - if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { - ext4_error_inode_block(inode, iloc.bh->b_blocknr, EIO, - "IO error syncing inode"); - err = -EIO; - } - brelse(iloc.bh); } return err; } @@ -6348,7 +6330,11 @@ int ext4_mark_iloc_dirty(handle_t *handle, /* the do_update_inode consumes one bh->b_count */ get_bh(iloc->bh); - + if (!ext4_handle_valid(handle)) { + if (!EXT4_I(inode)->i_metadata_bhs) + ext4_inode_attach_mmb(inode); + EXT4_I(inode)->i_metadata_bhs->inode_blk = iloc->bh->b_blocknr; + } /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ err = ext4_do_update_inode(handle, inode, iloc); put_bh(iloc->bh);
Use mmb inode buffer writeout infrastructure to reliably write out inode's inode table block on fsync(2) in nojournal mode (from ext4_sync_parent() and ext4_fsync_nojournal()). This significantly simplifies the code as we don't have to explicitly handle inode buffer writeback in ext4_write_inode() and thus we can also remove sync_inode_metadata() calls from ext4_sync_parent() and ext4_write_inode() call from ext4_fsync_nojournal(). Signed-off-by: Jan Kara <jack@suse.cz> --- fs/ext4/ext4_jbd2.c | 2 +- fs/ext4/ext4_jbd2.h | 2 ++ fs/ext4/fsync.c | 12 ------------ fs/ext4/inode.c | 24 +++++------------------- 4 files changed, 8 insertions(+), 32 deletions(-)