Message ID | 1231945948-23676-2-git-send-email-jack@suse.cz |
---|---|
State | Not Applicable, archived |
Headers | show |
Jan Kara wrote: > To be really safe that the data hit the platter, we should also flush drive's > writeback caches on fsync and for O_SYNC files or O_DIRSYNC inodes. Seems sane, but aren't we getting really divergent behavior here between ext2, ext3, and ext4 w.r.t. drive cache flushing for sync paths? (at least, ext3 has no calls to flush cache at this point) -Eric > Signed-off-by: Jan Kara <jack@suse.cz> > CC: pavel@suse.cz > --- > fs/ext2/dir.c | 5 ++++- > fs/ext2/fsync.c | 7 +++++-- > fs/ext2/inode.c | 7 +++++++ > fs/ext2/xattr.c | 6 +++++- > 4 files changed, 21 insertions(+), 4 deletions(-) > > diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c > index 2999d72..d6cb59f 100644 > --- a/fs/ext2/dir.c > +++ b/fs/ext2/dir.c > @@ -25,6 +25,7 @@ > #include <linux/buffer_head.h> > #include <linux/pagemap.h> > #include <linux/swap.h> > +#include <linux/blkdev.h> /* for blkdev_issue_flush() */ > > typedef struct ext2_dir_entry_2 ext2_dirent; > > @@ -97,8 +98,10 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len) > > if (IS_DIRSYNC(dir)) { > err = write_one_page(page, 1); > - if (!err) > + if (!err) { > err = ext2_sync_inode(dir); > + blkdev_issue_flush(dir->i_sb->s_bdev, NULL); > + } > } else { > unlock_page(page); > } > diff --git a/fs/ext2/fsync.c b/fs/ext2/fsync.c > index fc66c93..9cd1838 100644 > --- a/fs/ext2/fsync.c > +++ b/fs/ext2/fsync.c > @@ -24,6 +24,7 @@ > > #include "ext2.h" > #include <linux/buffer_head.h> /* for sync_mapping_buffers() */ > +#include <linux/blkdev.h> /* for blkdev_issue_flush() */ > > > /* > @@ -39,12 +40,14 @@ int ext2_sync_file(struct file *file, struct dentry *dentry, int datasync) > > ret = sync_mapping_buffers(inode->i_mapping); > if (!(inode->i_state & I_DIRTY)) > - return ret; > + goto out; > if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) > - return ret; > + goto out; > > err = ext2_sync_inode(inode); > if (ret == 0) > ret = err; > +out: > + blkdev_issue_flush(inode->i_sb->s_bdev, NULL); > return ret; > } > diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c > index 23fff2f..49b479e 100644 > --- a/fs/ext2/inode.c > +++ b/fs/ext2/inode.c > @@ -33,6 +33,7 @@ > #include <linux/mpage.h> > #include <linux/fiemap.h> > #include <linux/namei.h> > +#include <linux/blkdev.h> /* for blkdev_issue_flush() */ > #include "ext2.h" > #include "acl.h" > #include "xip.h" > @@ -68,9 +69,14 @@ void ext2_delete_inode (struct inode * inode) > mark_inode_dirty(inode); > ext2_update_inode(inode, inode_needs_sync(inode)); > > + /* Make sure inode deletion really gets to disk. Disk write caches > + * are flushed either in ext2_truncate() or we do it explicitly */ > inode->i_size = 0; > if (inode->i_blocks) > ext2_truncate (inode); > + else if (inode_needs_sync(inode)) > + blkdev_issue_flush(inode->i_sb->s_bdev, NULL); > + > ext2_free_inode (inode); > > return; > @@ -1104,6 +1110,7 @@ do_indirects: > if (inode_needs_sync(inode)) { > sync_mapping_buffers(inode->i_mapping); > ext2_sync_inode (inode); > + blkdev_issue_flush(inode->i_sb->s_bdev, NULL); > } else { > mark_inode_dirty(inode); > } > diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c > index 987a526..d480216 100644 > --- a/fs/ext2/xattr.c > +++ b/fs/ext2/xattr.c > @@ -60,6 +60,7 @@ > #include <linux/mbcache.h> > #include <linux/quotaops.h> > #include <linux/rwsem.h> > +#include <linux/blkdev.h> /* for blkdev_issue_flush() */ > #include "ext2.h" > #include "xattr.h" > #include "acl.h" > @@ -702,6 +703,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, > DQUOT_FREE_BLOCK(inode, 1); > goto cleanup; > } > + blkdev_issue_flush(sb->s_bdev, NULL); > } else > mark_inode_dirty(inode); > > @@ -792,8 +794,10 @@ ext2_xattr_delete_inode(struct inode *inode) > le32_to_cpu(HDR(bh)->h_refcount)); > unlock_buffer(bh); > mark_buffer_dirty(bh); > - if (IS_SYNC(inode)) > + if (IS_SYNC(inode)) { > sync_dirty_buffer(bh); > + blkdev_issue_flush(inode->i_sb->s_bdev, NULL); > + } > DQUOT_FREE_BLOCK(inode, 1); > } > EXT2_I(inode)->i_file_acl = 0; -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed 14-01-09 11:35:50, Eric Sandeen wrote: > Jan Kara wrote: > > To be really safe that the data hit the platter, we should also flush drive's > > writeback caches on fsync and for O_SYNC files or O_DIRSYNC inodes. > > Seems sane, but aren't we getting really divergent behavior here between > ext2, ext3, and ext4 w.r.t. drive cache flushing for sync paths? Well, but ext3/4 should do a barrier on a transaction commit (if the user really cares about data integrity) and hence it implicitely does the same. Honza
Jan Kara wrote: > On Wed 14-01-09 11:35:50, Eric Sandeen wrote: >> Jan Kara wrote: >>> To be really safe that the data hit the platter, we should also flush drive's >>> writeback caches on fsync and for O_SYNC files or O_DIRSYNC inodes. >> Seems sane, but aren't we getting really divergent behavior here between >> ext2, ext3, and ext4 w.r.t. drive cache flushing for sync paths? > Well, but ext3/4 should do a barrier on a transaction commit (if the user > really cares about data integrity) and hence it implicitely does the same. > > Honza Sorry, just now catching up with that other thread, which addresses this topic. :) But yes, this seems correct... (OTOH, I thought SuSE was carrying a patch which did add this blkdev_flush to sync for ext3...) -Eric -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, 14 Jan 2009 16:12:28 +0100 Jan Kara <jack@suse.cz> wrote: > To be really safe that the data hit the platter, we should also flush drive's > writeback caches on fsync and for O_SYNC files or O_DIRSYNC inodes. > It's not good to randomly sprinkle blkdev_issue_flush() calls all over the filesystem like this. How do we know that you didn't miss a site? How do we ensure that people who modify the fs in the future don't forget to add the blkdev_issue_flush() call, if needed? IOW, it is fragile. Is there anything we can do to make this more robust? Do the flush calls from some higher-level callsite? Perhaps even the VFS? > @@ -97,8 +98,10 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len) > > if (IS_DIRSYNC(dir)) { > err = write_one_page(page, 1); > - if (!err) > + if (!err) { > err = ext2_sync_inode(dir); > + blkdev_issue_flush(dir->i_sb->s_bdev, NULL); The patch itself would have been a bit neater if it had added int ext3_blkdev_issue_flush(struct inode *inode) and called that, IMO. Also, the changelog needs some work, methinks. /** * blkdev_issue_flush - queue a flush * @bdev: blockdev to issue flush for * @error_sector: error sector * * Description: * Issue a flush for the block device in question. Caller can supply * room for storing the error offset in case of a flush error, if they * wish to. Caller must run wait_for_completion() on its own. */ So afaict the change you've made is incomplete. We'll queue a writeback command to the disk but we won't wait for it to be sent down the wire. Nor do we wait for the command to complete at the device end. So it can still be a looong time (seconds!) before the data which the user thinks is on disk really is safe. Yes? If so, this design decision should be described in the changelog, and justified. Actually, doing this in the comment over ext3_blkdev_issue_flush() would be good. -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hi! > Also, the changelog needs some work, methinks. > > /** > * blkdev_issue_flush - queue a flush > * @bdev: blockdev to issue flush for > * @error_sector: error sector > * > * Description: > * Issue a flush for the block device in question. Caller can supply > * room for storing the error offset in case of a flush error, if they > * wish to. Caller must run wait_for_completion() on its own. > */ > > So afaict the change you've made is incomplete. We'll queue a > writeback command to the disk but we won't wait for it to be sent down > the wire. Nor do we wait for the command to complete at the device > end. So it can still be a looong time (seconds!) before the data which > the user thinks is on disk really is safe. > > Yes? > > If so, this design decision should be described in the changelog, and > justified. Actually, doing this in the comment over > ext3_blkdev_issue_flush() would be good. Actually, if sync() and fsync() do not work, it probably needs more than a comment in changelog. Like comment in documentation, in big bold letters. Pavel
On Wed, Jan 14, 2009 at 10:18:34AM -0800, Andrew Morton wrote: > On Wed, 14 Jan 2009 16:12:28 +0100 Jan Kara <jack@suse.cz> wrote: > > > To be really safe that the data hit the platter, we should also flush drive's > > writeback caches on fsync and for O_SYNC files or O_DIRSYNC inodes. > > > > It's not good to randomly sprinkle blkdev_issue_flush() calls all over > the filesystem like this. How do we know that you didn't miss a site? > How do we ensure that people who modify the fs in the future don't > forget to add the blkdev_issue_flush() call, if needed? > > IOW, it is fragile. Is there anything we can do to make this more > robust? Do the flush calls from some higher-level callsite? Perhaps > even the VFS? The problem with doing it in the VFS is that we might end up calling flush twice; it may very well be that for some filesystems, we'll be calling the flush operation as part of writing out the commit block, for example. I'm not sure it's fair to say that we need to "randomly sprinkle blkdev_issue_flush() calls all over the place". The number of places where we need to do O_SYNC or fsync() handling is in fact quite small. We were simply ignoring this problem before. > > + blkdev_issue_flush(dir->i_sb->s_bdev, NULL); > > The patch itself would have been a bit neater if it had added > > int ext3_blkdev_issue_flush(struct inode *inode) What, just to avoid needing the "..->i_sb->s_bdev" construction? I personally find extra levels of inline functions to be harder to deal with long term, since I then I have to track down the definition of ext3_blkdev_issue_flush in the header files to make sure there isn't any other magic hiding in there other than the i->i_sb->s_bdev derefencing. - Ted -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, 14 Jan 2009 17:09:00 -0500 Theodore Tso <tytso@MIT.EDU> wrote: > On Wed, Jan 14, 2009 at 10:18:34AM -0800, Andrew Morton wrote: > > On Wed, 14 Jan 2009 16:12:28 +0100 Jan Kara <jack@suse.cz> wrote: > > > > > To be really safe that the data hit the platter, we should also flush drive's > > > writeback caches on fsync and for O_SYNC files or O_DIRSYNC inodes. > > > > > > > It's not good to randomly sprinkle blkdev_issue_flush() calls all over > > the filesystem like this. How do we know that you didn't miss a site? > > How do we ensure that people who modify the fs in the future don't > > forget to add the blkdev_issue_flush() call, if needed? > > > > IOW, it is fragile. Is there anything we can do to make this more > > robust? Do the flush calls from some higher-level callsite? Perhaps > > even the VFS? > > The problem with doing it in the VFS is that we might end up calling > flush twice; it may very well be that for some filesystems, we'll be > calling the flush operation as part of writing out the commit block, > for example. It depends how it's done. One way would be via a new address_space_operations entry. > I'm not sure it's fair to say that we need to "randomly sprinkle > blkdev_issue_flush() calls all over the place". The number of places > where we need to do O_SYNC or fsync() handling is in fact quite small. > We were simply ignoring this problem before. Well it _does_ sprinkle them all over. > > > + blkdev_issue_flush(dir->i_sb->s_bdev, NULL); > > > > The patch itself would have been a bit neater if it had added > > > > int ext3_blkdev_issue_flush(struct inode *inode) > > What, just to avoid needing the "..->i_sb->s_bdev" construction? Partly as a cleanup, yes. But such a function then becomes a site where we can add commentary explaining the reason for its existence, along with commentary which describes the semantics in details, exceptions, caveats, etc. It also becomes a single site where the feature can be disabled for those who wish to evaluate its performance cost. The single site can also be used for convenient gathering of timing information. One thing we may well choose to do is to stop issuing the flush if blkdev_issue_flush() is returning -EOPNOTSUPP, which some devices will presumably do. Reissuing a pointless and quite expensive command over and over again doesn't seem clever. And I can probably think of more reasons :) > I > personally find extra levels of inline functions It shouldn't be inlined. > to be harder to deal > with long term, since I then I have to track down the definition of > ext3_blkdev_issue_flush in the header files to make sure there isn't > any other magic hiding in there other than the i->i_sb->s_bdev > derefencing. > > - Ted -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 2999d72..d6cb59f 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -25,6 +25,7 @@ #include <linux/buffer_head.h> #include <linux/pagemap.h> #include <linux/swap.h> +#include <linux/blkdev.h> /* for blkdev_issue_flush() */ typedef struct ext2_dir_entry_2 ext2_dirent; @@ -97,8 +98,10 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len) if (IS_DIRSYNC(dir)) { err = write_one_page(page, 1); - if (!err) + if (!err) { err = ext2_sync_inode(dir); + blkdev_issue_flush(dir->i_sb->s_bdev, NULL); + } } else { unlock_page(page); } diff --git a/fs/ext2/fsync.c b/fs/ext2/fsync.c index fc66c93..9cd1838 100644 --- a/fs/ext2/fsync.c +++ b/fs/ext2/fsync.c @@ -24,6 +24,7 @@ #include "ext2.h" #include <linux/buffer_head.h> /* for sync_mapping_buffers() */ +#include <linux/blkdev.h> /* for blkdev_issue_flush() */ /* @@ -39,12 +40,14 @@ int ext2_sync_file(struct file *file, struct dentry *dentry, int datasync) ret = sync_mapping_buffers(inode->i_mapping); if (!(inode->i_state & I_DIRTY)) - return ret; + goto out; if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - return ret; + goto out; err = ext2_sync_inode(inode); if (ret == 0) ret = err; +out: + blkdev_issue_flush(inode->i_sb->s_bdev, NULL); return ret; } diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 23fff2f..49b479e 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -33,6 +33,7 @@ #include <linux/mpage.h> #include <linux/fiemap.h> #include <linux/namei.h> +#include <linux/blkdev.h> /* for blkdev_issue_flush() */ #include "ext2.h" #include "acl.h" #include "xip.h" @@ -68,9 +69,14 @@ void ext2_delete_inode (struct inode * inode) mark_inode_dirty(inode); ext2_update_inode(inode, inode_needs_sync(inode)); + /* Make sure inode deletion really gets to disk. Disk write caches + * are flushed either in ext2_truncate() or we do it explicitly */ inode->i_size = 0; if (inode->i_blocks) ext2_truncate (inode); + else if (inode_needs_sync(inode)) + blkdev_issue_flush(inode->i_sb->s_bdev, NULL); + ext2_free_inode (inode); return; @@ -1104,6 +1110,7 @@ do_indirects: if (inode_needs_sync(inode)) { sync_mapping_buffers(inode->i_mapping); ext2_sync_inode (inode); + blkdev_issue_flush(inode->i_sb->s_bdev, NULL); } else { mark_inode_dirty(inode); } diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 987a526..d480216 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -60,6 +60,7 @@ #include <linux/mbcache.h> #include <linux/quotaops.h> #include <linux/rwsem.h> +#include <linux/blkdev.h> /* for blkdev_issue_flush() */ #include "ext2.h" #include "xattr.h" #include "acl.h" @@ -702,6 +703,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, DQUOT_FREE_BLOCK(inode, 1); goto cleanup; } + blkdev_issue_flush(sb->s_bdev, NULL); } else mark_inode_dirty(inode); @@ -792,8 +794,10 @@ ext2_xattr_delete_inode(struct inode *inode) le32_to_cpu(HDR(bh)->h_refcount)); unlock_buffer(bh); mark_buffer_dirty(bh); - if (IS_SYNC(inode)) + if (IS_SYNC(inode)) { sync_dirty_buffer(bh); + blkdev_issue_flush(inode->i_sb->s_bdev, NULL); + } DQUOT_FREE_BLOCK(inode, 1); } EXT2_I(inode)->i_file_acl = 0;
To be really safe that the data hit the platter, we should also flush drive's writeback caches on fsync and for O_SYNC files or O_DIRSYNC inodes. Signed-off-by: Jan Kara <jack@suse.cz> CC: pavel@suse.cz --- fs/ext2/dir.c | 5 ++++- fs/ext2/fsync.c | 7 +++++-- fs/ext2/inode.c | 7 +++++++ fs/ext2/xattr.c | 6 +++++- 4 files changed, 21 insertions(+), 4 deletions(-)