diff mbox

[2/3] ext4: honor the O_SYNC flag for aysnchronous direct I/O requests

Message ID 1327698949-12616-3-git-send-email-jmoyer@redhat.com
State Accepted, archived
Headers show

Commit Message

Jeff Moyer Jan. 27, 2012, 9:15 p.m. UTC
Hi,

If a file is opened with O_SYNC|O_DIRECT, the drive cache does not get
flushed after the write completion.  Instead, it's flushed *before* the
I/O is sent to the disk (in __generic_file_aio_write).  This patch
attempts to fix that problem by marking an I/O as requiring a cache
flush in endio processing.  I'll send a follow-on patch to the
generic write code to get rid of the bogus generic_write_sync call
when EIOCBQUEUED is returned.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
---
 fs/ext4/ext4.h    |    4 ++++
 fs/ext4/inode.c   |   11 +++++++++--
 fs/ext4/page-io.c |   39 ++++++++++++++++++++++++++++++++-------
 fs/ext4/super.c   |   11 +++++++++++
 4 files changed, 56 insertions(+), 9 deletions(-)

Comments

Jan Kara Feb. 2, 2012, 5:31 p.m. UTC | #1
Hi,

On Fri 27-01-12 16:15:48, Jeff Moyer wrote:
> If a file is opened with O_SYNC|O_DIRECT, the drive cache does not get
> flushed after the write completion.  Instead, it's flushed *before* the
> I/O is sent to the disk (in __generic_file_aio_write).  This patch
> attempts to fix that problem by marking an I/O as requiring a cache
> flush in endio processing.  I'll send a follow-on patch to the
> generic write code to get rid of the bogus generic_write_sync call
> when EIOCBQUEUED is returned.
  Thanks for the patch!

> Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
> ---
>  fs/ext4/ext4.h    |    4 ++++
>  fs/ext4/inode.c   |   11 +++++++++--
>  fs/ext4/page-io.c |   39 ++++++++++++++++++++++++++++++++-------
>  fs/ext4/super.c   |   11 +++++++++++
>  4 files changed, 56 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 2d55d7c..4377ed3 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -185,6 +185,7 @@ struct mpage_da_data {
>  #define EXT4_IO_END_ERROR	0x0002
>  #define EXT4_IO_END_QUEUED	0x0004
>  #define EXT4_IO_END_DIRECT	0x0008
> +#define EXT4_IO_END_NEEDS_SYNC	0x0010
>  
>  struct ext4_io_page {
>  	struct page	*p_page;
> @@ -1247,6 +1248,9 @@ struct ext4_sb_info {
>  	/* workqueue for dio unwritten */
>  	struct workqueue_struct *dio_unwritten_wq;
>  
> +	/* workqueue for aio+dio+o_sync disk cache flushing */
> +	struct workqueue_struct *aio_dio_flush_wq;
> +
  Hmm, looking at the patch I'm wondering why did you introduce the new
workqueue? It seems dio_unwritten_wq would be enough? You just need to
rename it to something more appropriate ;)

> diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
> index 9e1b8eb..d07cd40 100644
> --- a/fs/ext4/page-io.c
> +++ b/fs/ext4/page-io.c
> @@ -98,15 +98,40 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
>  		   "list->prev 0x%p\n",
>  		   io, inode->i_ino, io->list.next, io->list.prev);
>  
> -	ret = ext4_convert_unwritten_extents(inode, offset, size);
> -	if (ret < 0) {
> -		ext4_msg(inode->i_sb, KERN_EMERG,
> -			 "failed to convert unwritten extents to written "
> -			 "extents -- potential data loss!  "
> -			 "(inode %lu, offset %llu, size %zd, error %d)",
> -			 inode->i_ino, offset, size, ret);
> +	if (io->flag & EXT4_IO_END_UNWRITTEN) {
> +
> +		ret = ext4_convert_unwritten_extents(inode, offset, size);
> +		if (ret < 0) {
> +			ext4_msg(inode->i_sb, KERN_EMERG,
> +				 "failed to convert unwritten extents to "
> +				 "written extents -- potential data loss!  "
> +				 "(inode %lu, offset %llu, size %zd, error %d)",
> +				 inode->i_ino, offset, size, ret);
> +			goto endio;
> +		}
> +	}
> +
> +	/*
> +	 * This function has two callers.  The first is the end_io_work
> +	 * routine just below.  This is an asynchronous completion context.
> +	 * The second is in the fsync path.  For the latter path, we can't
> +	 * return from here until the job is done.  Hence, we issue a
> +	 * blocking blkdev_issue_flush call.
> +	 */
> +	if (io->flag & EXT4_IO_END_NEEDS_SYNC) {
> +		/*
> +		 * Ideally, we'd like to know if the force_commit routine
> +		 * actually did send something to disk.  If it didn't,
> +		 * then we need to issue the cache flush by hand.  For now,
> +		 * play it safe and do both.
> +		 */
> +		ret = ext4_force_commit(inode->i_sb);
> +		if (ret)
> +			goto endio;
> +		ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_NOIO, NULL);
  Look at what ext4_sync_file() does. It's more efficient than this.
You need something like:
	commit_tid = file->f_flags & __O_SYNC ? EXT4_I(inode)->i_sync_tid :
						EXT4_I(inode)->i_datasync_tid;
	if (journal->j_flags & JBD2_BARRIER &&
	    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
		needs_barrier = true;
	jbd2_log_start_commit(journal, commit_tid);
	jbd2_log_wait_commit(journal, commit_tid);
	if (needs_barrier)
		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_NOIO, NULL);

								Honza
Jeff Moyer Feb. 6, 2012, 4:20 p.m. UTC | #2
Jan Kara <jack@suse.cz> writes:

>> +	/* workqueue for aio+dio+o_sync disk cache flushing */
>> +	struct workqueue_struct *aio_dio_flush_wq;
>> +
>   Hmm, looking at the patch I'm wondering why did you introduce the new
> workqueue? It seems dio_unwritten_wq would be enough? You just need to
> rename it to something more appropriate ;)

I used a new workqueue as the operations are blocking, and I didn't want
to hold up other progress.  If you think re-using the unwritten_wq is
the right thing to do, I'm happy to comply.

>> +	/*
>> +	 * This function has two callers.  The first is the end_io_work
>> +	 * routine just below.  This is an asynchronous completion context.
>> +	 * The second is in the fsync path.  For the latter path, we can't
>> +	 * return from here until the job is done.  Hence, we issue a
>> +	 * blocking blkdev_issue_flush call.
>> +	 */
>> +	if (io->flag & EXT4_IO_END_NEEDS_SYNC) {
>> +		/*
>> +		 * Ideally, we'd like to know if the force_commit routine
>> +		 * actually did send something to disk.  If it didn't,
>> +		 * then we need to issue the cache flush by hand.  For now,
>> +		 * play it safe and do both.
>> +		 */
>> +		ret = ext4_force_commit(inode->i_sb);
>> +		if (ret)
>> +			goto endio;
>> +		ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_NOIO, NULL);
>   Look at what ext4_sync_file() does. It's more efficient than this.
> You need something like:
> 	commit_tid = file->f_flags & __O_SYNC ? EXT4_I(inode)->i_sync_tid :
> 						EXT4_I(inode)->i_datasync_tid;
> 	if (journal->j_flags & JBD2_BARRIER &&
> 	    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
> 		needs_barrier = true;
> 	jbd2_log_start_commit(journal, commit_tid);
> 	jbd2_log_wait_commit(journal, commit_tid);
> 	if (needs_barrier)
> 		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_NOIO, NULL);

Great, thanks for the pointer!

Cheers,
Jeff
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara Feb. 6, 2012, 4:58 p.m. UTC | #3
On Mon 06-02-12 11:20:29, Jeff Moyer wrote:
> Jan Kara <jack@suse.cz> writes:
> 
> >> +	/* workqueue for aio+dio+o_sync disk cache flushing */
> >> +	struct workqueue_struct *aio_dio_flush_wq;
> >> +
> >   Hmm, looking at the patch I'm wondering why did you introduce the new
> > workqueue? It seems dio_unwritten_wq would be enough? You just need to
> > rename it to something more appropriate ;)
> 
> I used a new workqueue as the operations are blocking, and I didn't want
> to hold up other progress.  If you think re-using the unwritten_wq is
> the right thing to do, I'm happy to comply.
  Ah, ok. Thinking about it, it's probably better to use a separate work
queue then.

								Honza
Jeff Moyer Feb. 8, 2012, 3:11 p.m. UTC | #4
Jan Kara <jack@suse.cz> writes:

>   Look at what ext4_sync_file() does. It's more efficient than this.
> You need something like:
> 	commit_tid = file->f_flags & __O_SYNC ? EXT4_I(inode)->i_sync_tid :
> 						EXT4_I(inode)->i_datasync_tid;
> 	if (journal->j_flags & JBD2_BARRIER &&
> 	    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
> 		needs_barrier = true;
> 	jbd2_log_start_commit(journal, commit_tid);
> 	jbd2_log_wait_commit(journal, commit_tid);
> 	if (needs_barrier)
> 		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_NOIO, NULL);

If the transaction won't send a data barrier, wouldn't you want to issue
the flush on the data device prior to commiting the transaction, not
after it?

-Jeff
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara Feb. 13, 2012, 6:27 p.m. UTC | #5
On Wed 08-02-12 10:11:47, Jeff Moyer wrote:
> Jan Kara <jack@suse.cz> writes:
> 
> >   Look at what ext4_sync_file() does. It's more efficient than this.
> > You need something like:
> > 	commit_tid = file->f_flags & __O_SYNC ? EXT4_I(inode)->i_sync_tid :
> > 						EXT4_I(inode)->i_datasync_tid;
> > 	if (journal->j_flags & JBD2_BARRIER &&
> > 	    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
> > 		needs_barrier = true;
> > 	jbd2_log_start_commit(journal, commit_tid);
> > 	jbd2_log_wait_commit(journal, commit_tid);
> > 	if (needs_barrier)
> > 		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_NOIO, NULL);
> 
> If the transaction won't send a data barrier, wouldn't you want to issue
> the flush on the data device prior to commiting the transaction, not
> after it?
  Sorry for late reply. I was thinking about this because the answer isn't
simple... One certain fact is that once ext4_convert_unwritten_extents()
finishes (calls ext4_journal_stop()), the transaction with metadata updates
can commit so whether we place flush before or after jbd2_log_start_commit()
makes no difference. For filesystems where journal is on the filesystem
device, the code should work correctly as is - journalling code will issue
the barrier before transaction commit is done and if there is no
transaction to commit, the place where we issue cache flush does not matter.
But for filesystems where journal is on separate device we indeed need to
issue the flush before the transaction commit is finished so that we
don't expose uninitialized data after crash.

Anyway that's a separate (although related) issue to the one which you fix
in this patch so you can leave the patch as is and I'll fixup the above
problem in a separate patch. Thanks for noticing this!

									Honza
diff mbox

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 2d55d7c..4377ed3 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -185,6 +185,7 @@  struct mpage_da_data {
 #define EXT4_IO_END_ERROR	0x0002
 #define EXT4_IO_END_QUEUED	0x0004
 #define EXT4_IO_END_DIRECT	0x0008
+#define EXT4_IO_END_NEEDS_SYNC	0x0010
 
 struct ext4_io_page {
 	struct page	*p_page;
@@ -1247,6 +1248,9 @@  struct ext4_sb_info {
 	/* workqueue for dio unwritten */
 	struct workqueue_struct *dio_unwritten_wq;
 
+	/* workqueue for aio+dio+o_sync disk cache flushing */
+	struct workqueue_struct *aio_dio_flush_wq;
+
 	/* timer for periodic error stats printing */
 	struct timer_list s_err_report;
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f6dc02b..13cdb4c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2769,8 +2769,12 @@  static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 
 	iocb->private = NULL;
 
+	/* AIO+DIO+O_SYNC I/Os need a cache flush after completion */
+	if (is_async && (IS_SYNC(inode) || (iocb->ki_filp->f_flags & O_DSYNC)))
+		io_end->flag |= EXT4_IO_END_NEEDS_SYNC;
+
 	/* if not aio dio with unwritten extents, just free io and return */
-	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+	if (!(io_end->flag & (EXT4_IO_END_UNWRITTEN|EXT4_IO_END_NEEDS_SYNC))) {
 		ext4_free_io_end(io_end);
 out:
 		if (is_async)
@@ -2785,7 +2789,10 @@  out:
 		io_end->iocb = iocb;
 		io_end->result = ret;
 	}
-	wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
+	if (io_end->flag & EXT4_IO_END_UNWRITTEN)
+		wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
+	else
+		wq = EXT4_SB(io_end->inode->i_sb)->aio_dio_flush_wq;
 
 	/* Add the io_end to per-inode completed aio dio list*/
 	ei = EXT4_I(io_end->inode);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 9e1b8eb..d07cd40 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -98,15 +98,40 @@  int ext4_end_io_nolock(ext4_io_end_t *io)
 		   "list->prev 0x%p\n",
 		   io, inode->i_ino, io->list.next, io->list.prev);
 
-	ret = ext4_convert_unwritten_extents(inode, offset, size);
-	if (ret < 0) {
-		ext4_msg(inode->i_sb, KERN_EMERG,
-			 "failed to convert unwritten extents to written "
-			 "extents -- potential data loss!  "
-			 "(inode %lu, offset %llu, size %zd, error %d)",
-			 inode->i_ino, offset, size, ret);
+	if (io->flag & EXT4_IO_END_UNWRITTEN) {
+
+		ret = ext4_convert_unwritten_extents(inode, offset, size);
+		if (ret < 0) {
+			ext4_msg(inode->i_sb, KERN_EMERG,
+				 "failed to convert unwritten extents to "
+				 "written extents -- potential data loss!  "
+				 "(inode %lu, offset %llu, size %zd, error %d)",
+				 inode->i_ino, offset, size, ret);
+			goto endio;
+		}
+	}
+
+	/*
+	 * This function has two callers.  The first is the end_io_work
+	 * routine just below.  This is an asynchronous completion context.
+	 * The second is in the fsync path.  For the latter path, we can't
+	 * return from here until the job is done.  Hence, we issue a
+	 * blocking blkdev_issue_flush call.
+	 */
+	if (io->flag & EXT4_IO_END_NEEDS_SYNC) {
+		/*
+		 * Ideally, we'd like to know if the force_commit routine
+		 * actually did send something to disk.  If it didn't,
+		 * then we need to issue the cache flush by hand.  For now,
+		 * play it safe and do both.
+		 */
+		ret = ext4_force_commit(inode->i_sb);
+		if (ret)
+			goto endio;
+		ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_NOIO, NULL);
 	}
 
+endio:
 	if (io->iocb)
 		aio_complete(io->iocb, io->result, 0);
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 502c61f..a24938e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -805,6 +805,7 @@  static void ext4_put_super(struct super_block *sb)
 	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
 
 	flush_workqueue(sbi->dio_unwritten_wq);
+	destroy_workqueue(sbi->aio_dio_flush_wq);
 	destroy_workqueue(sbi->dio_unwritten_wq);
 
 	lock_super(sb);
@@ -3718,6 +3719,13 @@  no_journal:
 		goto failed_mount_wq;
 	}
 
+	EXT4_SB(sb)->aio_dio_flush_wq =
+		alloc_workqueue("ext4-aio-dio-flush", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+	if (!EXT4_SB(sb)->aio_dio_flush_wq) {
+		printk(KERN_ERR "EXT4-fs: failed to create flush workqueue\n");
+		goto failed_flush_wq;
+	}
+
 	/*
 	 * The jbd2_journal_load will have done any necessary log recovery,
 	 * so we can safely mount the rest of the filesystem now.
@@ -3840,6 +3848,8 @@  failed_mount4a:
 	sb->s_root = NULL;
 failed_mount4:
 	ext4_msg(sb, KERN_ERR, "mount failed");
+	destroy_workqueue(EXT4_SB(sb)->aio_dio_flush_wq);
+failed_flush_wq:
 	destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
 failed_mount_wq:
 	if (sbi->s_journal) {
@@ -4303,6 +4313,7 @@  static int ext4_sync_fs(struct super_block *sb, int wait)
 
 	trace_ext4_sync_fs(sb, wait);
 	flush_workqueue(sbi->dio_unwritten_wq);
+	flush_workqueue(sbi->aio_dio_flush_wq);
 	if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
 		if (wait)
 			jbd2_log_wait_commit(sbi->s_journal, target);