Patchwork [2/9] ext4: honor the O_SYNC flag for aysnchronous direct I/O requests

login
register
mail settings
Submitter Darrick J. Wong
Date Nov. 20, 2012, 7:41 a.m.
Message ID <20121120074131.24645.38489.stgit@blackbox.djwong.org>
Download mbox | patch
Permalink /patch/200241/
State New
Headers show

Comments

Darrick J. Wong - Nov. 20, 2012, 7:41 a.m.
If a file is opened with O_SYNC|O_DIRECT, the drive cache does not get
flushed after the write completion.  Instead, it's flushed *before* the
I/O is sent to the disk (in __generic_file_aio_write).  This patch
attempts to fix that problem by marking an I/O as requiring a cache
flush in endio processing.  I'll send a follow-on patch to the
generic write code to get rid of the bogus generic_write_sync call
when EIOCBQUEUED is returned.

From: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
[darrick.wong@oracle.com: Rework original patch to reflect a subsequent
			  ext4 reorganization]
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/ext4/ext4.h    |    9 +++++
 fs/ext4/file.c    |    2 +
 fs/ext4/inode.c   |    6 +++
 fs/ext4/page-io.c |   92 +++++++++++++++++++++++++++++++++++++++++++++--------
 fs/ext4/super.c   |   13 +++++++
 5 files changed, 106 insertions(+), 16 deletions(-)



--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara - Nov. 20, 2012, 10:07 a.m.
On Mon 19-11-12 23:41:31, Darrick J. Wong wrote:
> If a file is opened with O_SYNC|O_DIRECT, the drive cache does not get
> flushed after the write completion.  Instead, it's flushed *before* the
> I/O is sent to the disk (in __generic_file_aio_write).  This patch
> attempts to fix that problem by marking an I/O as requiring a cache
> flush in endio processing.  I'll send a follow-on patch to the
> generic write code to get rid of the bogus generic_write_sync call
> when EIOCBQUEUED is returned.
> 
> From: Jeff Moyer <jmoyer@redhat.com>
> Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
> [darrick.wong@oracle.com: Rework original patch to reflect a subsequent
> 			  ext4 reorganization]
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  fs/ext4/ext4.h    |    9 +++++
>  fs/ext4/file.c    |    2 +
>  fs/ext4/inode.c   |    6 +++
>  fs/ext4/page-io.c |   92 +++++++++++++++++++++++++++++++++++++++++++++--------
>  fs/ext4/super.c   |   13 +++++++
>  5 files changed, 106 insertions(+), 16 deletions(-)
> 
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 3c20de1..f5a0b89 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -186,6 +186,7 @@ struct mpage_da_data {
>  #define EXT4_IO_END_ERROR	0x0002
>  #define EXT4_IO_END_QUEUED	0x0004
>  #define EXT4_IO_END_DIRECT	0x0008
> +#define EXT4_IO_END_NEEDS_SYNC	0x0010
>  
>  struct ext4_io_page {
>  	struct page	*p_page;
> @@ -1279,6 +1280,9 @@ struct ext4_sb_info {
>  	/* workqueue for dio unwritten */
>  	struct workqueue_struct *dio_unwritten_wq;
>  
> +	/* workqueue for aio+dio+o_sync disk cache flushing */
> +	struct workqueue_struct *aio_dio_flush_wq;
> +
  Umm, I'm not completely decided whether we really need a separate
workqueue. But it doesn't cost too much so I guess it makes some sense -
fsync() is rather heavy so syncing won't starve extent conversion...

> diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
> index 68e896e..cda013a 100644
> --- a/fs/ext4/page-io.c
> +++ b/fs/ext4/page-io.c
> @@ -84,6 +84,50 @@ void ext4_free_io_end(ext4_io_end_t *io)
>  	kmem_cache_free(io_end_cachep, io);
>  }
>  
> +/*
> + * This function is called in the completion path for AIO O_SYNC|O_DIRECT
> + * writes, and also in the fsync path.  The purpose is to ensure that the
> + * disk caches for the journal and data devices are flushed.
> + */
> +static int ext4_end_io_do_flush(ext4_io_end_t *io)
> +{
> +	struct inode *inode = io->inode;
> +	tid_t commit_tid;
> +	bool needs_barrier = false;
> +	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
> +	int barriers_enabled = test_opt(inode->i_sb, BARRIER);
> +	int ret = 0;
> +
> +	if (!barriers_enabled)
> +		return 0;
  This is definitely wrong. Even if barriers are disabled, we may need to
push out some buffers or commit a transaction.

> +
> +	/*
> +	 * If we are running in nojournal mode, just flush the disk
> +	 * cache and return.
> +	 */
> +	if (!journal)
> +		return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_NOIO, NULL);
  And this is wrong as well - you need to do work similar to what
ext4_sync_file() does. Actually it would be *much* better if these two
sites used the same helper function. Which also poses an interesting
question about locking - do we need i_mutex or not? Forcing a transaction
commit is definitely OK without it, similarly as grabbing transaction ids
from inode or ext4_should_journal_data() test. __sync_inode() call seems
to be OK without i_mutex as well so I believe we can just get rid of it
(getting i_mutex from the workqueue is a locking nightmare we don't want to
return to).

> +
> +	if (ext4_should_journal_data(inode)) {
> +		ret = ext4_force_commit(inode->i_sb);
> +		goto out;
> +	}
> +
> +	commit_tid = io->iocb->ki_filp->f_flags & __O_SYNC ?
> +		EXT4_I(inode)->i_sync_tid : EXT4_I(inode)->i_datasync_tid;
> +	if (!jbd2_trans_will_send_data_barrier(journal, commit_tid))
> +		needs_barrier = true;
> +
> +	jbd2_log_start_commit(journal, commit_tid);
> +	ret = jbd2_log_wait_commit(journal, commit_tid);
> +
> +	if (!ret && needs_barrier)
> +		ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_NOIO, NULL);
> +
> +out:
> +	return ret;
> +}
> +
>  /* check a range of space and convert unwritten extents to written. */
>  static int ext4_end_io(ext4_io_end_t *io)
>  {
...
> @@ -149,8 +209,11 @@ void ext4_add_complete_io(ext4_io_end_t *io_end)
>  	struct workqueue_struct *wq;
>  	unsigned long flags;
>  
> -	BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
> -	wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
> +	BUG_ON(!ext4_io_end_deferred(io_end));
> +	if (io_end->flag & EXT4_IO_END_UNWRITTEN)
> +		wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
> +	else
> +		wq = EXT4_SB(io_end->inode->i_sb)->aio_dio_flush_wq;
  Umm, I'd prefer if we used aio_dio_flush_wq when EXT4_IO_END_NEEDS_SYNC
is set. That way slow syncing works will be always offloaded to a separate
workqueue.

								Honza
Jeff Moyer - Nov. 20, 2012, 8:02 p.m.
Jan Kara <jack@suse.cz> writes:

>> @@ -1279,6 +1280,9 @@ struct ext4_sb_info {
>>  	/* workqueue for dio unwritten */
>>  	struct workqueue_struct *dio_unwritten_wq;
>>  
>> +	/* workqueue for aio+dio+o_sync disk cache flushing */
>> +	struct workqueue_struct *aio_dio_flush_wq;
>> +
>   Umm, I'm not completely decided whether we really need a separate
> workqueue. But it doesn't cost too much so I guess it makes some sense -
> fsync() is rather heavy so syncing won't starve extent conversion...

I'm assuming you'd like me to convert the names from flush to fsync,
yes?

>> +
>> +	/*
>> +	 * If we are running in nojournal mode, just flush the disk
>> +	 * cache and return.
>> +	 */
>> +	if (!journal)
>> +		return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_NOIO, NULL);
>   And this is wrong as well - you need to do work similar to what
> ext4_sync_file() does. Actually it would be *much* better if these two
> sites used the same helper function. Which also poses an interesting
> question about locking - do we need i_mutex or not? Forcing a transaction
> commit is definitely OK without it, similarly as grabbing transaction ids
> from inode or ext4_should_journal_data() test. __sync_inode() call seems
> to be OK without i_mutex as well so I believe we can just get rid of it
> (getting i_mutex from the workqueue is a locking nightmare we don't want to
> return to).

Just to be clear, are you saying you would like me to remove the
mutex_lock/unlock pair from ext4_sync_file?  (I had already factored out
the common code between this new code path and the fsync path in my tree.)

>> @@ -149,8 +209,11 @@ void ext4_add_complete_io(ext4_io_end_t *io_end)
>>  	struct workqueue_struct *wq;
>>  	unsigned long flags;
>>  
>> -	BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
>> -	wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
>> +	BUG_ON(!ext4_io_end_deferred(io_end));
>> +	if (io_end->flag & EXT4_IO_END_UNWRITTEN)
>> +		wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
>> +	else
>> +		wq = EXT4_SB(io_end->inode->i_sb)->aio_dio_flush_wq;
>   Umm, I'd prefer if we used aio_dio_flush_wq when EXT4_IO_END_NEEDS_SYNC
> is set. That way slow syncing works will be always offloaded to a separate
> workqueue.

OK.

Thanks!
Jeff
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3c20de1..f5a0b89 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -186,6 +186,7 @@  struct mpage_da_data {
 #define EXT4_IO_END_ERROR	0x0002
 #define EXT4_IO_END_QUEUED	0x0004
 #define EXT4_IO_END_DIRECT	0x0008
+#define EXT4_IO_END_NEEDS_SYNC	0x0010
 
 struct ext4_io_page {
 	struct page	*p_page;
@@ -1279,6 +1280,9 @@  struct ext4_sb_info {
 	/* workqueue for dio unwritten */
 	struct workqueue_struct *dio_unwritten_wq;
 
+	/* workqueue for aio+dio+o_sync disk cache flushing */
+	struct workqueue_struct *aio_dio_flush_wq;
+
 	/* timer for periodic error stats printing */
 	struct timer_list s_err_report;
 
@@ -1335,6 +1339,11 @@  static inline void ext4_set_io_unwritten_flag(struct inode *inode,
 	}
 }
 
+static inline int ext4_io_end_deferred(ext4_io_end_t *i)
+{
+	return i->flag & (EXT4_IO_END_UNWRITTEN | EXT4_IO_END_NEEDS_SYNC);
+}
+
 static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode)
 {
 	return inode->i_private;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index bf3966b..dd34b81 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -155,7 +155,7 @@  ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	mutex_unlock(&inode->i_mutex);
 
-	if (ret > 0 || ret == -EIOCBQUEUED) {
+	if (ret > 0) {
 		ssize_t err;
 
 		err = generic_write_sync(file, pos, ret);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b3c243b..fc4f05e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2893,8 +2893,12 @@  static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 
 	iocb->private = NULL;
 
+	/* AIO+DIO+O_SYNC I/Os need a cache flush after completion */
+	if (is_async && (IS_SYNC(inode) || (iocb->ki_filp->f_flags & O_DSYNC)))
+		io_end->flag |= EXT4_IO_END_NEEDS_SYNC;
+
 	/* if not aio dio with unwritten extents, just free io and return */
-	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+	if (!ext4_io_end_deferred(io_end)) {
 		ext4_free_io_end(io_end);
 out:
 		if (is_async)
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 68e896e..cda013a 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -84,6 +84,50 @@  void ext4_free_io_end(ext4_io_end_t *io)
 	kmem_cache_free(io_end_cachep, io);
 }
 
+/*
+ * This function is called in the completion path for AIO O_SYNC|O_DIRECT
+ * writes, and also in the fsync path.  The purpose is to ensure that the
+ * disk caches for the journal and data devices are flushed.
+ */
+static int ext4_end_io_do_flush(ext4_io_end_t *io)
+{
+	struct inode *inode = io->inode;
+	tid_t commit_tid;
+	bool needs_barrier = false;
+	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+	int barriers_enabled = test_opt(inode->i_sb, BARRIER);
+	int ret = 0;
+
+	if (!barriers_enabled)
+		return 0;
+
+	/*
+	 * If we are running in nojournal mode, just flush the disk
+	 * cache and return.
+	 */
+	if (!journal)
+		return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_NOIO, NULL);
+
+	if (ext4_should_journal_data(inode)) {
+		ret = ext4_force_commit(inode->i_sb);
+		goto out;
+	}
+
+	commit_tid = io->iocb->ki_filp->f_flags & __O_SYNC ?
+		EXT4_I(inode)->i_sync_tid : EXT4_I(inode)->i_datasync_tid;
+	if (!jbd2_trans_will_send_data_barrier(journal, commit_tid))
+		needs_barrier = true;
+
+	jbd2_log_start_commit(journal, commit_tid);
+	ret = jbd2_log_wait_commit(journal, commit_tid);
+
+	if (!ret && needs_barrier)
+		ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_NOIO, NULL);
+
+out:
+	return ret;
+}
+
 /* check a range of space and convert unwritten extents to written. */
 static int ext4_end_io(ext4_io_end_t *io)
 {
@@ -96,21 +140,37 @@  static int ext4_end_io(ext4_io_end_t *io)
 		   "list->prev 0x%p\n",
 		   io, inode->i_ino, io->list.next, io->list.prev);
 
-	ret = ext4_convert_unwritten_extents(inode, offset, size);
-	if (ret < 0) {
-		ext4_msg(inode->i_sb, KERN_EMERG,
-			 "failed to convert unwritten extents to written "
-			 "extents -- potential data loss!  "
-			 "(inode %lu, offset %llu, size %zd, error %d)",
-			 inode->i_ino, offset, size, ret);
+	if (io->flag & EXT4_IO_END_UNWRITTEN) {
+		ret = ext4_convert_unwritten_extents(inode, offset, size);
+		if (ret < 0) {
+			ext4_msg(inode->i_sb, KERN_EMERG,
+				 "failed to convert unwritten extents to "
+				 "written extents -- potential data loss!  "
+				 "(inode %lu, offset %llu, size %zd, error %d)",
+				 inode->i_ino, offset, size, ret);
+			goto endio;
+		}
 	}
+
+	/*
+	 * This function has two callers.  The first is the end_io_work
+	 * routine just below, which is an asynchronous completion context.
+	 * The second is in the fsync path.  For the latter path, we can't
+	 * return from here until the job is done.  Hence,
+	 * ext4_end_io_do_flush is blocking.
+	 */
+	if (io->flag & EXT4_IO_END_NEEDS_SYNC)
+		ret = ext4_end_io_do_flush(io);
+
+endio:
 	if (io->iocb)
 		aio_complete(io->iocb, io->result, 0);
 
 	if (io->flag & EXT4_IO_END_DIRECT)
 		inode_dio_done(inode);
 	/* Wake up anyone waiting on unwritten extent conversion */
-	if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
+	if (io->flag & EXT4_IO_END_UNWRITTEN &&
+	    atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
 		wake_up_all(ext4_ioend_wq(io->inode));
 	return ret;
 }
@@ -149,8 +209,11 @@  void ext4_add_complete_io(ext4_io_end_t *io_end)
 	struct workqueue_struct *wq;
 	unsigned long flags;
 
-	BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
-	wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
+	BUG_ON(!ext4_io_end_deferred(io_end));
+	if (io_end->flag & EXT4_IO_END_UNWRITTEN)
+		wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
+	else
+		wq = EXT4_SB(io_end->inode->i_sb)->aio_dio_flush_wq;
 
 	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
 	if (list_empty(&ei->i_completed_io_list)) {
@@ -180,7 +243,7 @@  static int ext4_do_flush_completed_IO(struct inode *inode,
 
 	while (!list_empty(&unwritten)) {
 		io = list_entry(unwritten.next, ext4_io_end_t, list);
-		BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
+		BUG_ON(!ext4_io_end_deferred(io));
 		list_del_init(&io->list);
 
 		err = ext4_end_io(io);
@@ -192,7 +255,8 @@  static int ext4_do_flush_completed_IO(struct inode *inode,
 	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
 	while (!list_empty(&complete)) {
 		io = list_entry(complete.next, ext4_io_end_t, list);
-		io->flag &= ~EXT4_IO_END_UNWRITTEN;
+		io->flag &= ~(EXT4_IO_END_UNWRITTEN |
+			      EXT4_IO_END_NEEDS_SYNC);
 		/* end_io context can not be destroyed now because it still
 		 * used by queued worker. Worker thread will destroy it later */
 		if (io->flag & EXT4_IO_END_QUEUED)
@@ -204,7 +268,7 @@  static int ext4_do_flush_completed_IO(struct inode *inode,
 	 * flag, and destroy it's end_io if it was converted already */
 	if (work_io) {
 		work_io->flag &= ~EXT4_IO_END_QUEUED;
-		if (!(work_io->flag & EXT4_IO_END_UNWRITTEN))
+		if (!ext4_io_end_deferred(work_io))
 			list_add_tail(&work_io->list, &to_free);
 	}
 	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
@@ -319,7 +383,7 @@  static void ext4_end_bio(struct bio *bio, int error)
 			     bi_sector >> (inode->i_blkbits - 9));
 	}
 
-	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+	if (!ext4_io_end_deferred(io_end)) {
 		ext4_free_io_end(io_end);
 		return;
 	}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 80928f7..c657c4d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -849,7 +849,9 @@  static void ext4_put_super(struct super_block *sb)
 	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
 
 	flush_workqueue(sbi->dio_unwritten_wq);
+	flush_workqueue(sbi->aio_dio_flush_wq);
 	destroy_workqueue(sbi->dio_unwritten_wq);
+	destroy_workqueue(sbi->aio_dio_flush_wq);
 
 	if (sbi->s_journal) {
 		err = jbd2_journal_destroy(sbi->s_journal);
@@ -3913,6 +3915,14 @@  no_journal:
 		goto failed_mount_wq;
 	}
 
+	EXT4_SB(sb)->aio_dio_flush_wq =
+		alloc_workqueue("ext4-aio-dio-flush",
+				WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+	if (!EXT4_SB(sb)->aio_dio_flush_wq) {
+		pr_err("EXT4-fs: failed to create flush workqueue\n");
+		goto failed_flush_wq;
+	}
+
 	/*
 	 * The jbd2_journal_load will have done any necessary log recovery,
 	 * so we can safely mount the rest of the filesystem now.
@@ -4045,6 +4055,8 @@  failed_mount4a:
 	sb->s_root = NULL;
 failed_mount4:
 	ext4_msg(sb, KERN_ERR, "mount failed");
+	destroy_workqueue(EXT4_SB(sb)->aio_dio_flush_wq);
+failed_flush_wq:
 	destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
 failed_mount_wq:
 	if (sbi->s_journal) {
@@ -4494,6 +4506,7 @@  static int ext4_sync_fs(struct super_block *sb, int wait)
 
 	trace_ext4_sync_fs(sb, wait);
 	flush_workqueue(sbi->dio_unwritten_wq);
+	flush_workqueue(sbi->aio_dio_flush_wq);
 	/*
 	 * Writeback quota in non-journalled quota case - journalled quota has
 	 * no dirty dquots