diff mbox

[2/2] ext2: Add blk_issue_flush() to syncing paths

Message ID 1231945948-23676-2-git-send-email-jack@suse.cz
State Not Applicable, archived
Headers show

Commit Message

Jan Kara Jan. 14, 2009, 3:12 p.m. UTC
To be really safe that the data hit the platter, we should also flush drive's
writeback caches on fsync and for O_SYNC files or O_DIRSYNC inodes.

Signed-off-by: Jan Kara <jack@suse.cz>
CC: pavel@suse.cz
---
 fs/ext2/dir.c   |    5 ++++-
 fs/ext2/fsync.c |    7 +++++--
 fs/ext2/inode.c |    7 +++++++
 fs/ext2/xattr.c |    6 +++++-
 4 files changed, 21 insertions(+), 4 deletions(-)

Comments

Eric Sandeen Jan. 14, 2009, 5:35 p.m. UTC | #1
Jan Kara wrote:
> To be really safe that the data hit the platter, we should also flush drive's
> writeback caches on fsync and for O_SYNC files or O_DIRSYNC inodes.

Seems sane, but aren't we getting really divergent behavior here between
ext2, ext3, and ext4 w.r.t. drive cache flushing for sync paths?

(at least, ext3 has no calls to flush cache at this point)

-Eric

> Signed-off-by: Jan Kara <jack@suse.cz>
> CC: pavel@suse.cz
> ---
>  fs/ext2/dir.c   |    5 ++++-
>  fs/ext2/fsync.c |    7 +++++--
>  fs/ext2/inode.c |    7 +++++++
>  fs/ext2/xattr.c |    6 +++++-
>  4 files changed, 21 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
> index 2999d72..d6cb59f 100644
> --- a/fs/ext2/dir.c
> +++ b/fs/ext2/dir.c
> @@ -25,6 +25,7 @@
>  #include <linux/buffer_head.h>
>  #include <linux/pagemap.h>
>  #include <linux/swap.h>
> +#include <linux/blkdev.h>		/* for blkdev_issue_flush() */
>  
>  typedef struct ext2_dir_entry_2 ext2_dirent;
>  
> @@ -97,8 +98,10 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
>  
>  	if (IS_DIRSYNC(dir)) {
>  		err = write_one_page(page, 1);
> -		if (!err)
> +		if (!err) {
>  			err = ext2_sync_inode(dir);
> +			blkdev_issue_flush(dir->i_sb->s_bdev, NULL);
> +		}
>  	} else {
>  		unlock_page(page);
>  	}
> diff --git a/fs/ext2/fsync.c b/fs/ext2/fsync.c
> index fc66c93..9cd1838 100644
> --- a/fs/ext2/fsync.c
> +++ b/fs/ext2/fsync.c
> @@ -24,6 +24,7 @@
>  
>  #include "ext2.h"
>  #include <linux/buffer_head.h>		/* for sync_mapping_buffers() */
> +#include <linux/blkdev.h>		/* for blkdev_issue_flush() */
>  
>  
>  /*
> @@ -39,12 +40,14 @@ int ext2_sync_file(struct file *file, struct dentry *dentry, int datasync)
>  
>  	ret = sync_mapping_buffers(inode->i_mapping);
>  	if (!(inode->i_state & I_DIRTY))
> -		return ret;
> +		goto out;
>  	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
> -		return ret;
> +		goto out;
>  
>  	err = ext2_sync_inode(inode);
>  	if (ret == 0)
>  		ret = err;
> +out:
> +	blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
>  	return ret;
>  }
> diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
> index 23fff2f..49b479e 100644
> --- a/fs/ext2/inode.c
> +++ b/fs/ext2/inode.c
> @@ -33,6 +33,7 @@
>  #include <linux/mpage.h>
>  #include <linux/fiemap.h>
>  #include <linux/namei.h>
> +#include <linux/blkdev.h>	/* for blkdev_issue_flush() */
>  #include "ext2.h"
>  #include "acl.h"
>  #include "xip.h"
> @@ -68,9 +69,14 @@ void ext2_delete_inode (struct inode * inode)
>  	mark_inode_dirty(inode);
>  	ext2_update_inode(inode, inode_needs_sync(inode));
>  
> +	/* Make sure inode deletion really gets to disk. Disk write caches
> +	 * are flushed either in ext2_truncate() or we do it explicitly */
>  	inode->i_size = 0;
>  	if (inode->i_blocks)
>  		ext2_truncate (inode);
> +	else if (inode_needs_sync(inode))
> +		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
> +
>  	ext2_free_inode (inode);
>  
>  	return;
> @@ -1104,6 +1110,7 @@ do_indirects:
>  	if (inode_needs_sync(inode)) {
>  		sync_mapping_buffers(inode->i_mapping);
>  		ext2_sync_inode (inode);
> +		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
>  	} else {
>  		mark_inode_dirty(inode);
>  	}
> diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
> index 987a526..d480216 100644
> --- a/fs/ext2/xattr.c
> +++ b/fs/ext2/xattr.c
> @@ -60,6 +60,7 @@
>  #include <linux/mbcache.h>
>  #include <linux/quotaops.h>
>  #include <linux/rwsem.h>
> +#include <linux/blkdev.h>		/* for blkdev_issue_flush() */
>  #include "ext2.h"
>  #include "xattr.h"
>  #include "acl.h"
> @@ -702,6 +703,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
>  				DQUOT_FREE_BLOCK(inode, 1);
>  			goto cleanup;
>  		}
> +		blkdev_issue_flush(sb->s_bdev, NULL);
>  	} else
>  		mark_inode_dirty(inode);
>  
> @@ -792,8 +794,10 @@ ext2_xattr_delete_inode(struct inode *inode)
>  			le32_to_cpu(HDR(bh)->h_refcount));
>  		unlock_buffer(bh);
>  		mark_buffer_dirty(bh);
> -		if (IS_SYNC(inode))
> +		if (IS_SYNC(inode)) {
>  			sync_dirty_buffer(bh);
> +			blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
> +		}
>  		DQUOT_FREE_BLOCK(inode, 1);
>  	}
>  	EXT2_I(inode)->i_file_acl = 0;

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara Jan. 14, 2009, 5:40 p.m. UTC | #2
On Wed 14-01-09 11:35:50, Eric Sandeen wrote:
> Jan Kara wrote:
> > To be really safe that the data hit the platter, we should also flush drive's
> > writeback caches on fsync and for O_SYNC files or O_DIRSYNC inodes.
> 
> Seems sane, but aren't we getting really divergent behavior here between
> ext2, ext3, and ext4 w.r.t. drive cache flushing for sync paths?
  Well, but ext3/4 should do a barrier on a transaction commit (if the user
really cares about data integrity) and hence it implicitely does the same.

									Honza
Eric Sandeen Jan. 14, 2009, 5:47 p.m. UTC | #3
Jan Kara wrote:
> On Wed 14-01-09 11:35:50, Eric Sandeen wrote:
>> Jan Kara wrote:
>>> To be really safe that the data hit the platter, we should also flush drive's
>>> writeback caches on fsync and for O_SYNC files or O_DIRSYNC inodes.
>> Seems sane, but aren't we getting really divergent behavior here between
>> ext2, ext3, and ext4 w.r.t. drive cache flushing for sync paths?
>   Well, but ext3/4 should do a barrier on a transaction commit (if the user
> really cares about data integrity) and hence it implicitely does the same.
> 
> 									Honza

Sorry, just now catching up with that other thread, which addresses this
topic.  :)  But yes, this seems correct...  (OTOH, I thought SuSE was
carrying a patch which did add this blkdev_flush to sync for ext3...)

-Eric
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrew Morton Jan. 14, 2009, 6:18 p.m. UTC | #4
On Wed, 14 Jan 2009 16:12:28 +0100 Jan Kara <jack@suse.cz> wrote:

> To be really safe that the data hit the platter, we should also flush drive's
> writeback caches on fsync and for O_SYNC files or O_DIRSYNC inodes.
> 

It's not good to randomly sprinkle blkdev_issue_flush() calls all over
the filesystem like this.  How do we know that you didn't miss a site? 
How do we ensure that people who modify the fs in the future don't
forget to add the blkdev_issue_flush() call, if needed?

IOW, it is fragile.  Is there anything we can do to make this more
robust?  Do the flush calls from some higher-level callsite?  Perhaps
even the VFS?

> @@ -97,8 +98,10 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
>  
>  	if (IS_DIRSYNC(dir)) {
>  		err = write_one_page(page, 1);
> -		if (!err)
> +		if (!err) {
>  			err = ext2_sync_inode(dir);
> +			blkdev_issue_flush(dir->i_sb->s_bdev, NULL);

The patch itself would have been a bit neater if it had added

int ext3_blkdev_issue_flush(struct inode *inode)

and called that, IMO.


Also, the changelog needs some work, methinks.

/**
 * blkdev_issue_flush - queue a flush
 * @bdev:	blockdev to issue flush for
 * @error_sector:	error sector
 *
 * Description:
 *    Issue a flush for the block device in question. Caller can supply
 *    room for storing the error offset in case of a flush error, if they
 *    wish to.  Caller must run wait_for_completion() on its own.
 */

So afaict the change you've made is incomplete.  We'll queue a
writeback command to the disk but we won't wait for it to be sent down
the wire.  Nor do we wait for the command to complete at the device
end.  So it can still be a looong time (seconds!) before the data which
the user thinks is on disk really is safe.

Yes?

If so, this design decision should be described in the changelog, and
justified.  Actually, doing this in the comment over
ext3_blkdev_issue_flush() would be good.


--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pavel Machek Jan. 14, 2009, 6:27 p.m. UTC | #5
Hi!

> Also, the changelog needs some work, methinks.
> 
> /**
>  * blkdev_issue_flush - queue a flush
>  * @bdev:	blockdev to issue flush for
>  * @error_sector:	error sector
>  *
>  * Description:
>  *    Issue a flush for the block device in question. Caller can supply
>  *    room for storing the error offset in case of a flush error, if they
>  *    wish to.  Caller must run wait_for_completion() on its own.
>  */
> 
> So afaict the change you've made is incomplete.  We'll queue a
> writeback command to the disk but we won't wait for it to be sent down
> the wire.  Nor do we wait for the command to complete at the device
> end.  So it can still be a looong time (seconds!) before the data which
> the user thinks is on disk really is safe.
> 
> Yes?
> 
> If so, this design decision should be described in the changelog, and
> justified.  Actually, doing this in the comment over
> ext3_blkdev_issue_flush() would be good.

Actually, if sync() and fsync() do not work, it probably needs more
than a comment in changelog. Like comment in documentation, in big
bold letters.
									Pavel
Theodore Y. Ts'o Jan. 14, 2009, 10:09 p.m. UTC | #6
On Wed, Jan 14, 2009 at 10:18:34AM -0800, Andrew Morton wrote:
> On Wed, 14 Jan 2009 16:12:28 +0100 Jan Kara <jack@suse.cz> wrote:
> 
> > To be really safe that the data hit the platter, we should also flush drive's
> > writeback caches on fsync and for O_SYNC files or O_DIRSYNC inodes.
> > 
> 
> It's not good to randomly sprinkle blkdev_issue_flush() calls all over
> the filesystem like this.  How do we know that you didn't miss a site? 
> How do we ensure that people who modify the fs in the future don't
> forget to add the blkdev_issue_flush() call, if needed?
> 
> IOW, it is fragile.  Is there anything we can do to make this more
> robust?  Do the flush calls from some higher-level callsite?  Perhaps
> even the VFS?

The problem with doing it in the VFS is that we might end up calling
flush twice; it may very well be that for some filesystems, we'll be
calling the flush operation as part of writing out the commit block,
for example.

I'm not sure it's fair to say that we need to "randomly sprinkle
blkdev_issue_flush() calls all over the place".  The number of places
where we need to do O_SYNC or fsync() handling is in fact quite small.
We were simply ignoring this problem before.

> > +			blkdev_issue_flush(dir->i_sb->s_bdev, NULL);
> 
> The patch itself would have been a bit neater if it had added
> 
> int ext3_blkdev_issue_flush(struct inode *inode)

What, just to avoid needing the "..->i_sb->s_bdev" construction?  I
personally find extra levels of inline functions to be harder to deal
with long term, since I then I have to track down the definition of
ext3_blkdev_issue_flush in the header files to make sure there isn't
any other magic hiding in there other than the i->i_sb->s_bdev
derefencing.

						- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrew Morton Jan. 14, 2009, 10:26 p.m. UTC | #7
On Wed, 14 Jan 2009 17:09:00 -0500
Theodore Tso <tytso@MIT.EDU> wrote:

> On Wed, Jan 14, 2009 at 10:18:34AM -0800, Andrew Morton wrote:
> > On Wed, 14 Jan 2009 16:12:28 +0100 Jan Kara <jack@suse.cz> wrote:
> > 
> > > To be really safe that the data hit the platter, we should also flush drive's
> > > writeback caches on fsync and for O_SYNC files or O_DIRSYNC inodes.
> > > 
> > 
> > It's not good to randomly sprinkle blkdev_issue_flush() calls all over
> > the filesystem like this.  How do we know that you didn't miss a site? 
> > How do we ensure that people who modify the fs in the future don't
> > forget to add the blkdev_issue_flush() call, if needed?
> > 
> > IOW, it is fragile.  Is there anything we can do to make this more
> > robust?  Do the flush calls from some higher-level callsite?  Perhaps
> > even the VFS?
> 
> The problem with doing it in the VFS is that we might end up calling
> flush twice; it may very well be that for some filesystems, we'll be
> calling the flush operation as part of writing out the commit block,
> for example.

It depends how it's done.  One way would be via a new
address_space_operations entry.

> I'm not sure it's fair to say that we need to "randomly sprinkle
> blkdev_issue_flush() calls all over the place".  The number of places
> where we need to do O_SYNC or fsync() handling is in fact quite small.
> We were simply ignoring this problem before.

Well it _does_ sprinkle them all over.

> > > +			blkdev_issue_flush(dir->i_sb->s_bdev, NULL);
> > 
> > The patch itself would have been a bit neater if it had added
> > 
> > int ext3_blkdev_issue_flush(struct inode *inode)
> 
> What, just to avoid needing the "..->i_sb->s_bdev" construction?

Partly as a cleanup, yes.  But such a function then becomes a site
where we can add commentary explaining the reason for its existence,
along with commentary which describes the semantics in details,
exceptions, caveats, etc.

It also becomes a single site where the feature can be disabled for
those who wish to evaluate its performance cost.

The single site can also be used for convenient gathering of timing
information.

One thing we may well choose to do is to stop issuing the flush if
blkdev_issue_flush() is returning -EOPNOTSUPP, which some devices will
presumably do.  Reissuing a pointless and quite expensive command over
and over again doesn't seem clever.

And I can probably think of more reasons :)


>  I
> personally find extra levels of inline functions

It shouldn't be inlined.

> to be harder to deal
> with long term, since I then I have to track down the definition of
> ext3_blkdev_issue_flush in the header files to make sure there isn't
> any other magic hiding in there other than the i->i_sb->s_bdev
> derefencing.
> 
> 						- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 2999d72..d6cb59f 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -25,6 +25,7 @@ 
 #include <linux/buffer_head.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
+#include <linux/blkdev.h>		/* for blkdev_issue_flush() */
 
 typedef struct ext2_dir_entry_2 ext2_dirent;
 
@@ -97,8 +98,10 @@  static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
 
 	if (IS_DIRSYNC(dir)) {
 		err = write_one_page(page, 1);
-		if (!err)
+		if (!err) {
 			err = ext2_sync_inode(dir);
+			blkdev_issue_flush(dir->i_sb->s_bdev, NULL);
+		}
 	} else {
 		unlock_page(page);
 	}
diff --git a/fs/ext2/fsync.c b/fs/ext2/fsync.c
index fc66c93..9cd1838 100644
--- a/fs/ext2/fsync.c
+++ b/fs/ext2/fsync.c
@@ -24,6 +24,7 @@ 
 
 #include "ext2.h"
 #include <linux/buffer_head.h>		/* for sync_mapping_buffers() */
+#include <linux/blkdev.h>		/* for blkdev_issue_flush() */
 
 
 /*
@@ -39,12 +40,14 @@  int ext2_sync_file(struct file *file, struct dentry *dentry, int datasync)
 
 	ret = sync_mapping_buffers(inode->i_mapping);
 	if (!(inode->i_state & I_DIRTY))
-		return ret;
+		goto out;
 	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		return ret;
+		goto out;
 
 	err = ext2_sync_inode(inode);
 	if (ret == 0)
 		ret = err;
+out:
+	blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
 	return ret;
 }
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 23fff2f..49b479e 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -33,6 +33,7 @@ 
 #include <linux/mpage.h>
 #include <linux/fiemap.h>
 #include <linux/namei.h>
+#include <linux/blkdev.h>	/* for blkdev_issue_flush() */
 #include "ext2.h"
 #include "acl.h"
 #include "xip.h"
@@ -68,9 +69,14 @@  void ext2_delete_inode (struct inode * inode)
 	mark_inode_dirty(inode);
 	ext2_update_inode(inode, inode_needs_sync(inode));
 
+	/* Make sure inode deletion really gets to disk. Disk write caches
+	 * are flushed either in ext2_truncate() or we do it explicitly */
 	inode->i_size = 0;
 	if (inode->i_blocks)
 		ext2_truncate (inode);
+	else if (inode_needs_sync(inode))
+		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+
 	ext2_free_inode (inode);
 
 	return;
@@ -1104,6 +1110,7 @@  do_indirects:
 	if (inode_needs_sync(inode)) {
 		sync_mapping_buffers(inode->i_mapping);
 		ext2_sync_inode (inode);
+		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
 	} else {
 		mark_inode_dirty(inode);
 	}
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 987a526..d480216 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -60,6 +60,7 @@ 
 #include <linux/mbcache.h>
 #include <linux/quotaops.h>
 #include <linux/rwsem.h>
+#include <linux/blkdev.h>		/* for blkdev_issue_flush() */
 #include "ext2.h"
 #include "xattr.h"
 #include "acl.h"
@@ -702,6 +703,7 @@  ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 				DQUOT_FREE_BLOCK(inode, 1);
 			goto cleanup;
 		}
+		blkdev_issue_flush(sb->s_bdev, NULL);
 	} else
 		mark_inode_dirty(inode);
 
@@ -792,8 +794,10 @@  ext2_xattr_delete_inode(struct inode *inode)
 			le32_to_cpu(HDR(bh)->h_refcount));
 		unlock_buffer(bh);
 		mark_buffer_dirty(bh);
-		if (IS_SYNC(inode))
+		if (IS_SYNC(inode)) {
 			sync_dirty_buffer(bh);
+			blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+		}
 		DQUOT_FREE_BLOCK(inode, 1);
 	}
 	EXT2_I(inode)->i_file_acl = 0;