Patchwork [1/3] xfs: honor the O_SYNC flag for asynchronous direct I/O requests

login
register
mail settings
Submitter Jeff Moyer
Date Jan. 27, 2012, 9:15 p.m.
Message ID <1327698949-12616-2-git-send-email-jmoyer@redhat.com>
Download mbox | patch
Permalink /patch/138333/
State New
Headers show

Comments

Jeff Moyer - Jan. 27, 2012, 9:15 p.m.
Hi,

If a file is opened with O_SYNC|O_DIRECT, the drive cache does not get
flushed after the write completion.  Instead, it's flushed *before* the
I/O is sent to the disk (in __generic_file_aio_write).  This patch
attempts to fix that problem by marking an I/O as requiring a cache
flush in endio processing.  I'll send a follow-on patch to the
generic write code to get rid of the bogus generic_write_sync call
when EIOCBQUEUED is returned.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
---
 fs/xfs/xfs_aops.c |   69 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/xfs_aops.h |    1 +
 fs/xfs/xfs_buf.c  |    9 +++++++
 3 files changed, 77 insertions(+), 2 deletions(-)
Christoph Hellwig - Jan. 28, 2012, 2:59 p.m.
This looks pretty good.  Did this pass xfstests?  I'd also like to add
tests actually executing this code path, just to be sure.  E.g. variants
of aio-stress actually using O_SYNC.  We can't easily test that data really
made it to disk that way, although at least we make sure the code
doesn't break.

On Fri, Jan 27, 2012 at 04:15:47PM -0500, Jeff Moyer wrote:
> Hi,
> 
> If a file is opened with O_SYNC|O_DIRECT, the drive cache does not get
> flushed after the write completion.  Instead, it's flushed *before* the
> I/O is sent to the disk (in __generic_file_aio_write).

XFS doesn't actually use __generic_file_aio_write, so this sentence
isn't correct for XFS.

> +	} else if (xfs_ioend_needs_cache_flush(ioend)) {
> +		struct xfs_inode *ip = XFS_I(ioend->io_inode);
> +		struct xfs_mount *mp = ip->i_mount;
> +		int	err;
> +		int	log_flushed = 0;
> +
> +		/*
> +		 * Check to see if we only need to sync data.  If so,
> +		 * we can skip the log flush.
> +		 */
> +		if (IS_SYNC(ioend->io_inode) ||
> +		    (ioend->io_iocb->ki_filp->f_flags & __O_SYNC)) {

> +			err = _xfs_log_force(mp, XFS_LOG_SYNC, &log_flushed);

Can you add a TODO comment that this actually is synchronous and thus
will block the I/O completion work queue?

Also you can use _xfs_log_force_lsn here as we don't need to flush the
whole log, just up to the last lsn that touched the inode.  Copy, or
better factor out the code from xfs_dir_fsync for that.

Last but not least this won't catch timestamp updates.  Given that I'm
about to send a series making timestamp updates transactional I would not
recommend you to bother with that, but if you want to, take a look
at how xfs_file_fsync deals with them.  Given that this series touches
the same area I'd also like to take your xfs patch in through the xfs tree
to avoid conflicts.

> @@ -47,6 +47,7 @@ STATIC int xfsbufd(void *);
>  static struct workqueue_struct *xfslogd_workqueue;
>  struct workqueue_struct *xfsdatad_workqueue;
>  struct workqueue_struct *xfsconvertd_workqueue;
> +struct workqueue_struct *xfsflushd_workqueue;
>  
>  #ifdef XFS_BUF_LOCK_TRACKING
>  # define XB_SET_OWNER(bp)	((bp)->b_last_holder = current->pid)
> @@ -1802,8 +1803,15 @@ xfs_buf_init(void)
>  	if (!xfsconvertd_workqueue)
>  		goto out_destroy_xfsdatad_workqueue;
>  
> +	xfsflushd_workqueue = alloc_workqueue("xfsflushd",
> +					      WQ_MEM_RECLAIM, 1);

This should allow a higher concurrency level; it's probably a good
idea to pass 0 and use the default.

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 574d4ee..909e020 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -158,6 +158,48 @@  xfs_setfilesize(
 }
 
 /*
+ * In the case of synchronous, AIO, O_DIRECT writes, we need to flush
+ * the disk cache when the I/O is complete.
+ */
+STATIC bool
+xfs_ioend_needs_cache_flush(
+	struct xfs_ioend	*ioend)
+{
+	if (!ioend->io_isasync)
+		return false;
+
+	return (IS_SYNC(ioend->io_inode) ||
+		(ioend->io_iocb->ki_filp->f_flags & O_DSYNC));
+}
+
+STATIC void
+xfs_end_io_flush(
+	struct bio	*bio,
+	int		error)
+{
+	struct xfs_ioend *ioend = bio->bi_private;
+
+	if (error && ioend->io_result > 0)
+		ioend->io_result = error;
+
+	xfs_destroy_ioend(ioend);
+	bio_put(bio);
+}
+
+STATIC void
+xfs_ioend_flush_cache(
+	struct xfs_ioend	*ioend)
+{
+	struct bio *bio;
+
+	bio = bio_alloc(GFP_KERNEL, 0);
+	bio->bi_end_io = xfs_end_io_flush;
+	bio->bi_bdev = xfs_find_bdev_for_inode(ioend->io_inode);
+	bio->bi_private = ioend;
+	submit_bio(WRITE_FLUSH, bio);
+}
+
+/*
  * Schedule IO completion handling on the final put of an ioend.
  *
  * If there is no work to do we might as well call it a day and free the
@@ -172,6 +214,8 @@  xfs_finish_ioend(
 			queue_work(xfsconvertd_workqueue, &ioend->io_work);
 		else if (xfs_ioend_is_append(ioend))
 			queue_work(xfsdatad_workqueue, &ioend->io_work);
+		else if (xfs_ioend_needs_cache_flush(ioend))
+			queue_work(xfsflushd_workqueue, &ioend->io_work);
 		else
 			xfs_destroy_ioend(ioend);
 	}
@@ -226,9 +270,30 @@  done:
 		xfs_finish_ioend(ioend);
 		/* ensure we don't spin on blocked ioends */
 		delay(1);
-	} else {
+	} else if (xfs_ioend_needs_cache_flush(ioend)) {
+		struct xfs_inode *ip = XFS_I(ioend->io_inode);
+		struct xfs_mount *mp = ip->i_mount;
+		int	err;
+		int	log_flushed = 0;
+
+		/*
+		 * Check to see if we only need to sync data.  If so,
+		 * we can skip the log flush.
+		 */
+		if (IS_SYNC(ioend->io_inode) ||
+		    (ioend->io_iocb->ki_filp->f_flags & __O_SYNC)) {
+			err = _xfs_log_force(mp, XFS_LOG_SYNC, &log_flushed);
+			if (err && ioend->io_result > 0)
+				ioend->io_result = err;
+			if (err || log_flushed) {
+				xfs_destroy_ioend(ioend);
+				return;
+			}
+		}
+		/* log not flushed or data sync only, flush the disk cache */
+		xfs_ioend_flush_cache(ioend);
+	} else
 		xfs_destroy_ioend(ioend);
-	}
 }
 
 /*
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 116dd5c..3f4a1c4 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -20,6 +20,7 @@ 
 
 extern struct workqueue_struct *xfsdatad_workqueue;
 extern struct workqueue_struct *xfsconvertd_workqueue;
+extern struct workqueue_struct *xfsflushd_workqueue;
 extern mempool_t *xfs_ioend_pool;
 
 /*
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 4dff85c..39980a8 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -47,6 +47,7 @@  STATIC int xfsbufd(void *);
 static struct workqueue_struct *xfslogd_workqueue;
 struct workqueue_struct *xfsdatad_workqueue;
 struct workqueue_struct *xfsconvertd_workqueue;
+struct workqueue_struct *xfsflushd_workqueue;
 
 #ifdef XFS_BUF_LOCK_TRACKING
 # define XB_SET_OWNER(bp)	((bp)->b_last_holder = current->pid)
@@ -1802,8 +1803,15 @@  xfs_buf_init(void)
 	if (!xfsconvertd_workqueue)
 		goto out_destroy_xfsdatad_workqueue;
 
+	xfsflushd_workqueue = alloc_workqueue("xfsflushd",
+					      WQ_MEM_RECLAIM, 1);
+	if (!xfsflushd_workqueue)
+		goto out_destroy_xfsconvertd_workqueue;
+
 	return 0;
 
+ out_destroy_xfsconvertd_workqueue:
+	destroy_workqueue(xfsconvertd_workqueue);
  out_destroy_xfsdatad_workqueue:
 	destroy_workqueue(xfsdatad_workqueue);
  out_destroy_xfslogd_workqueue:
@@ -1817,6 +1825,7 @@  xfs_buf_init(void)
 void
 xfs_buf_terminate(void)
 {
+	destroy_workqueue(xfsflushd_workqueue);
 	destroy_workqueue(xfsconvertd_workqueue);
 	destroy_workqueue(xfsdatad_workqueue);
 	destroy_workqueue(xfslogd_workqueue);