Patchwork [4/9] xfs: honor the O_SYNC flag for asynchronous direct I/O requests

login
register
mail settings
Submitter Darrick J. Wong
Date Nov. 20, 2012, 7:51 a.m.
Message ID <20121120075114.25270.40680.stgit@blackbox.djwong.org>
Download mbox | patch
Permalink /patch/200245/
State New
Headers show

Comments

Darrick J. Wong - Nov. 20, 2012, 7:51 a.m.
If a file is opened with O_SYNC|O_DIRECT, the drive cache does not get
flushed after the write completion for AIOs.  This patch attempts to fix
that problem by marking an I/O as requiring a cache flush in endio
processing, and then issuing the cache flush after any unwritten extent
conversion is done.

From: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
[darrick.wong@oracle.com: Rework patch to use per-mount workqueues]
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_aops.c  |   52 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/xfs_aops.h  |    1 +
 fs/xfs/xfs_mount.h |    1 +
 fs/xfs/xfs_super.c |    8 ++++++++
 4 files changed, 61 insertions(+), 1 deletion(-)




--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara - Nov. 20, 2012, 10:24 a.m.
On Mon 19-11-12 23:51:14, Darrick J. Wong wrote:
> If a file is opened with O_SYNC|O_DIRECT, the drive cache does not get
> flushed after the write completion for AIOs.  This patch attempts to fix
> that problem by marking an I/O as requiring a cache flush in endio
> processing, and then issuing the cache flush after any unwritten extent
> conversion is done.
> 
> From: Jeff Moyer <jmoyer@redhat.com>
> Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
> [darrick.wong@oracle.com: Rework patch to use per-mount workqueues]
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  fs/xfs/xfs_aops.c  |   52 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>  fs/xfs/xfs_aops.h  |    1 +
>  fs/xfs/xfs_mount.h |    1 +
>  fs/xfs/xfs_super.c |    8 ++++++++
>  4 files changed, 61 insertions(+), 1 deletion(-)
> 
> 
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index e57e2da..9cebbb7 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -173,6 +173,24 @@ xfs_setfilesize(
>  }
>  
>  /*
> + * In the case of synchronous, AIO, O_DIRECT writes, we need to flush
> + * the disk cache when the I/O is complete.
> + */
> +STATIC bool
> +xfs_ioend_needs_cache_flush(
> +	struct xfs_ioend	*ioend)
> +{
> +	struct xfs_inode *ip = XFS_I(ioend->io_inode);
> +	struct xfs_mount *mp = ip->i_mount;
> +
> +	if (!(mp->m_flags & XFS_MOUNT_BARRIER))
> +		return false;
> +
> +	return IS_SYNC(ioend->io_inode) ||
> +	       (ioend->io_iocb->ki_filp->f_flags & O_DSYNC);
> +}
  I'm not really an XFS developer but calling things "...cache_flush" when
you actually mean fsync is IMHO misleading.

> +
> +/*
>   * Schedule IO completion handling on the final put of an ioend.
>   *
>   * If there is no work to do we might as well call it a day and free the
> @@ -189,11 +207,30 @@ xfs_finish_ioend(
>  			queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
>  		else if (ioend->io_append_trans)
>  			queue_work(mp->m_data_workqueue, &ioend->io_work);
> +		else if (ioend->io_needs_fsync)
> +			queue_work(mp->m_aio_blkdev_flush_wq, &ioend->io_work);
>  		else
>  			xfs_destroy_ioend(ioend);
>  	}
>  }
>  
> +STATIC int
> +xfs_ioend_force_cache_flush(
> +	xfs_ioend_t	*ioend)
> +{
> +	struct xfs_inode *ip = XFS_I(ioend->io_inode);
> +	struct xfs_mount *mp = ip->i_mount;
> +	int		err = 0;
> +	int		datasync;
> +
> +	datasync = !IS_SYNC(ioend->io_inode) &&
> +		!(ioend->io_iocb->ki_filp->f_flags & __O_SYNC);
> +	err = do_xfs_file_fsync(ip, mp, datasync);
> +	xfs_destroy_ioend(ioend);
> +	/* do_xfs_file_fsync returns -errno. our caller expects positive. */
> +	return -err;
> +}
> +
>  /*
>   * IO write completion.
>   */
> @@ -250,12 +287,22 @@ xfs_end_io(
>  		error = xfs_setfilesize(ioend);
>  		if (error)
>  			ioend->io_error = -error;
> +	} else if (ioend->io_needs_fsync) {
> +		error = xfs_ioend_force_cache_flush(ioend);
> +		if (error && ioend->io_result > 0)
> +			ioend->io_error = -error;
> +		ioend->io_needs_fsync = 0;
>  	} else {
>  		ASSERT(!xfs_ioend_is_append(ioend));
>  	}
>  
>  done:
> -	xfs_destroy_ioend(ioend);
> +	/* the honoring of O_SYNC has to be done last */
> +	if (ioend->io_needs_fsync) {
> +		atomic_inc(&ioend->io_remaining);
> +		xfs_finish_ioend(ioend);
> +	} else
> +		xfs_destroy_ioend(ioend);
>  }
  Umm, I don't quite get why you do things the way you do in xfs_end_io().
Why don't you handle fsync there always but offload it instead to workqueue
again in some cases? Is it so that it gets processed in the right workqueue
or why?

								Honza
Dave Chinner - Nov. 20, 2012, 11:20 a.m.
On Mon, Nov 19, 2012 at 11:51:14PM -0800, Darrick J. Wong wrote:
> If a file is opened with O_SYNC|O_DIRECT, the drive cache does not get
> flushed after the write completion for AIOs.  This patch attempts to fix
> that problem by marking an I/O as requiring a cache flush in endio
> processing, and then issuing the cache flush after any unwritten extent
> conversion is done.
> 
> From: Jeff Moyer <jmoyer@redhat.com>
> Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
> [darrick.wong@oracle.com: Rework patch to use per-mount workqueues]
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  fs/xfs/xfs_aops.c  |   52 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>  fs/xfs/xfs_aops.h  |    1 +
>  fs/xfs/xfs_mount.h |    1 +
>  fs/xfs/xfs_super.c |    8 ++++++++
>  4 files changed, 61 insertions(+), 1 deletion(-)
> 
> 
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index e57e2da..9cebbb7 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -173,6 +173,24 @@ xfs_setfilesize(
>  }
>  
>  /*
> + * In the case of synchronous, AIO, O_DIRECT writes, we need to flush
> + * the disk cache when the I/O is complete.
> + */

That's not quite right - we need to fsync the metadata when the data
IO is complete. Block device/disk cache flushes are irrelevant at
this level as that is wholly encapsulated inside the metadata fsync
processing.

> +STATIC bool
> +xfs_ioend_needs_cache_flush(

xfs_ioend_need_fsync()

> +	struct xfs_ioend	*ioend)
> +{
> +	struct xfs_inode *ip = XFS_I(ioend->io_inode);
> +	struct xfs_mount *mp = ip->i_mount;
> +
> +	if (!(mp->m_flags & XFS_MOUNT_BARRIER))
> +		return false;

And regardless of whether we have barriers enabled or not, we need
to flush the dirty metadata to the log for O_SYNC/O_DSYNC+AIO+DIO
writes here. So there should be no check of the mount flags here.

> +	return IS_SYNC(ioend->io_inode) ||
> +	       (ioend->io_iocb->ki_filp->f_flags & O_DSYNC);
> +}
> +
> +/*
>   * Schedule IO completion handling on the final put of an ioend.
>   *
>   * If there is no work to do we might as well call it a day and free the
> @@ -189,11 +207,30 @@ xfs_finish_ioend(
>  			queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
>  		else if (ioend->io_append_trans)
>  			queue_work(mp->m_data_workqueue, &ioend->io_work);
> +		else if (ioend->io_needs_fsync)
> +			queue_work(mp->m_aio_blkdev_flush_wq, &ioend->io_work);
>  		else
>  			xfs_destroy_ioend(ioend);
>  	}
>  }
>  
> +STATIC int
> +xfs_ioend_force_cache_flush(

STATIC void
xfs_ioend_fsync()

> +	xfs_ioend_t	*ioend)
> +{
> +	struct xfs_inode *ip = XFS_I(ioend->io_inode);
> +	struct xfs_mount *mp = ip->i_mount;
> +	int		err = 0;
> +	int		datasync;

	trace_xfs_ioend_fsync(ip);

> +	datasync = !IS_SYNC(ioend->io_inode) &&
> +		!(ioend->io_iocb->ki_filp->f_flags & __O_SYNC);
> +	err = do_xfs_file_fsync(ip, mp, datasync);
> +	xfs_destroy_ioend(ioend);

I think this is wrong. The ioend is destroyed by the caller, so
putting it here turns all subsequent uses by the caller into
use-after-free memory corruption bugs.


> +	/* do_xfs_file_fsync returns -errno. our caller expects positive. */
> +	return -err;

This is why xfs_do_file_fsync() should be returning a positive
error - to avoid repeated juggling of error signs.

> +}
> +
>  /*
>   * IO write completion.
>   */
> @@ -250,12 +287,22 @@ xfs_end_io(
>  		error = xfs_setfilesize(ioend);
>  		if (error)
>  			ioend->io_error = -error;
> +	} else if (ioend->io_needs_fsync) {
> +		error = xfs_ioend_force_cache_flush(ioend);
> +		if (error && ioend->io_result > 0)
> +			ioend->io_error = -error;
> +		ioend->io_needs_fsync = 0;

So this is all use-after-free. Also, there's no need to clear
io_needs_fsync() as the ioend is about to be destroyed.

>  	} else {
>  		ASSERT(!xfs_ioend_is_append(ioend));
>  	}
>  
>  done:
> -	xfs_destroy_ioend(ioend);
> +	/* the honoring of O_SYNC has to be done last */
> +	if (ioend->io_needs_fsync) {
> +		atomic_inc(&ioend->io_remaining);
> +		xfs_finish_ioend(ioend);
> +	} else
> +		xfs_destroy_ioend(ioend);

And requeuing work from one workqueue to the next is something that
we can avoid.  We know at IO submission time (i.e.
xfs_vm_direct_io()) whether an fsync completion is going to be needed
during IO completion.  The ioend->io_needs_fsync flag can be set
then, and the first pass through xfs_finish_ioend() can queue it to
the correct workqueue.  i.e. it only needs to be queued if it's not
already an unwritten or append ioend and it needs an fsync.

As it is, all the data completion workqueues run the same completion
function so all you need to do is handle the fsync case at the end
of the existing processing - it's not an else case. i.e the end of
xfs_end_io() becomes:

	if (ioend->io_needs_fsync) {
		error = xfs_ioend_fsync(ioend);
		if (error)
			ioend->io_error = -error;
		goto done;
	}
done:
	xfs_destroy_ioend(ioend);

As it is, this code is going to change before these changes go in -
there's a nasty regression in the DIO code that I found this
afternoon that requires reworking this IO completion logic to
avoid. The patch will appear on the list soon....

> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -209,6 +209,7 @@ typedef struct xfs_mount {
>  	struct workqueue_struct	*m_data_workqueue;
>  	struct workqueue_struct	*m_unwritten_workqueue;
>  	struct workqueue_struct	*m_cil_workqueue;
> +	struct workqueue_struct *m_aio_blkdev_flush_wq;

	struct workqueue_struct *m_aio_fsync_wq;

Cheers,

Dave.
Jeff Moyer - Nov. 20, 2012, 7:42 p.m.
Dave Chinner <david@fromorbit.com> writes:

> And requeuing work from one workqueue to the next is something that
> we can avoid.  We know at IO submission time (i.e.
> xfs_vm_direct_io)) whether an fsync completion is going to be needed
> during Io completion.  The ioend->io_needs_fsync flag can be set
> then, and the first pass through xfs_finish_ioend() can queue it to
> the correct workqueue.  i.e. it only needs to be queued if it's not
> already an unwritten or append ioend and it needs an fsync.
>
> As it is, all the data completion workqueues run the same completion
> function so all you need to do is handle the fsync case at the end
> of the existing processing - it's not an else case. i.e the end of
> xfs_end_io() becomes:
>
> 	if (ioend->io_needs_fsync) {
> 		error = xfs_ioend_fsync(ioend);
> 		if (error)
> 			ioend->io_error = -error;
> 		goto done;
> 	}
> done:
> 	xfs_destroy_ioend(ioend);

Works for me, that makes things simpler.

> As it is, this code is going to change before these changes go in -
> there's a nasty regression in the DIO code that I found this
> afternoon that requires reworking this IO completion logic to
> avoid. The patch will appear on the list soon....

I'm not on the xfs list, so if you haven't already sent it, mind Cc-ing
me?

>> --- a/fs/xfs/xfs_mount.h
>> +++ b/fs/xfs/xfs_mount.h
>> @@ -209,6 +209,7 @@ typedef struct xfs_mount {
>>  	struct workqueue_struct	*m_data_workqueue;
>>  	struct workqueue_struct	*m_unwritten_workqueue;
>>  	struct workqueue_struct	*m_cil_workqueue;
>> +	struct workqueue_struct *m_aio_blkdev_flush_wq;
>
> 	struct workqueue_struct *m_aio_fsync_wq;

For the record, m_aio_blkdev_flush_wq is the name you chose previously.
;-)

Thanks for the review!

Cheers,
Jeff
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner - Nov. 20, 2012, 8:08 p.m.
On Tue, Nov 20, 2012 at 02:42:48PM -0500, Jeff Moyer wrote:
> Dave Chinner <david@fromorbit.com> writes:
> 
> > And requeuing work from one workqueue to the next is something that
> > we can avoid.  We know at IO submission time (i.e.
> > xfs_vm_direct_io)) whether an fsync completion is going to be needed
> > during Io completion.  The ioend->io_needs_fsync flag can be set
> > then, and the first pass through xfs_finish_ioend() can queue it to
> > the correct workqueue.  i.e. it only needs to be queued if it's not
> > already an unwritten or append ioend and it needs an fsync.
> >
> > As it is, all the data completion workqueues run the same completion
> > function so all you need to do is handle the fsync case at the end
> > of the existing processing - it's not an else case. i.e the end of
> > xfs_end_io() becomes:
> >
> > 	if (ioend->io_needs_fsync) {
> > 		error = xfs_ioend_fsync(ioend);
> > 		if (error)
> > 			ioend->io_error = -error;
> > 		goto done;
> > 	}
> > done:
> > 	xfs_destroy_ioend(ioend);
> 
> Works for me, that makes things simpler.
> 
> > As it is, this code is going to change before these changes go in -
> > there's a nasty regression in the DIO code that I found this
> > afternoon that requires reworking this IO completion logic to
> > avoid. The patch will appear on the list soon....
> 
> I'm not on the xfs list, so if you haven't already sent it, mind Cc-ing
> me?

http://oss.sgi.com/archives/xfs/2012-11/msg00493.html

First cut is here, but it will change a bit as review goes on...

> >> --- a/fs/xfs/xfs_mount.h
> >> +++ b/fs/xfs/xfs_mount.h
> >> @@ -209,6 +209,7 @@ typedef struct xfs_mount {
> >>  	struct workqueue_struct	*m_data_workqueue;
> >>  	struct workqueue_struct	*m_unwritten_workqueue;
> >>  	struct workqueue_struct	*m_cil_workqueue;
> >> +	struct workqueue_struct *m_aio_blkdev_flush_wq;
> >
> > 	struct workqueue_struct *m_aio_fsync_wq;
> 
> For the record, m_aio_blkdev_flush_wq is the name you chose previously.
> ;-)

<cue Led Zeppelin>

It's been a long time since I read that patch....

:)

Cheers,

Dave....

Patch

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index e57e2da..9cebbb7 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -173,6 +173,24 @@  xfs_setfilesize(
 }
 
 /*
+ * In the case of synchronous, AIO, O_DIRECT writes, we need to flush
+ * the disk cache when the I/O is complete.
+ */
+STATIC bool
+xfs_ioend_needs_cache_flush(
+	struct xfs_ioend	*ioend)
+{
+	struct xfs_inode *ip = XFS_I(ioend->io_inode);
+	struct xfs_mount *mp = ip->i_mount;
+
+	if (!(mp->m_flags & XFS_MOUNT_BARRIER))
+		return false;
+
+	return IS_SYNC(ioend->io_inode) ||
+	       (ioend->io_iocb->ki_filp->f_flags & O_DSYNC);
+}
+
+/*
  * Schedule IO completion handling on the final put of an ioend.
  *
  * If there is no work to do we might as well call it a day and free the
@@ -189,11 +207,30 @@  xfs_finish_ioend(
 			queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
 		else if (ioend->io_append_trans)
 			queue_work(mp->m_data_workqueue, &ioend->io_work);
+		else if (ioend->io_needs_fsync)
+			queue_work(mp->m_aio_blkdev_flush_wq, &ioend->io_work);
 		else
 			xfs_destroy_ioend(ioend);
 	}
 }
 
+STATIC int
+xfs_ioend_force_cache_flush(
+	xfs_ioend_t	*ioend)
+{
+	struct xfs_inode *ip = XFS_I(ioend->io_inode);
+	struct xfs_mount *mp = ip->i_mount;
+	int		err = 0;
+	int		datasync;
+
+	datasync = !IS_SYNC(ioend->io_inode) &&
+		!(ioend->io_iocb->ki_filp->f_flags & __O_SYNC);
+	err = do_xfs_file_fsync(ip, mp, datasync);
+	xfs_destroy_ioend(ioend);
+	/* do_xfs_file_fsync returns -errno. our caller expects positive. */
+	return -err;
+}
+
 /*
  * IO write completion.
  */
@@ -250,12 +287,22 @@  xfs_end_io(
 		error = xfs_setfilesize(ioend);
 		if (error)
 			ioend->io_error = -error;
+	} else if (ioend->io_needs_fsync) {
+		error = xfs_ioend_force_cache_flush(ioend);
+		if (error && ioend->io_result > 0)
+			ioend->io_error = -error;
+		ioend->io_needs_fsync = 0;
 	} else {
 		ASSERT(!xfs_ioend_is_append(ioend));
 	}
 
 done:
-	xfs_destroy_ioend(ioend);
+	/* the honoring of O_SYNC has to be done last */
+	if (ioend->io_needs_fsync) {
+		atomic_inc(&ioend->io_remaining);
+		xfs_finish_ioend(ioend);
+	} else
+		xfs_destroy_ioend(ioend);
 }
 
 /*
@@ -292,6 +339,7 @@  xfs_alloc_ioend(
 	atomic_set(&ioend->io_remaining, 1);
 	ioend->io_isasync = 0;
 	ioend->io_isdirect = 0;
+	ioend->io_needs_fsync = 0;
 	ioend->io_error = 0;
 	ioend->io_list = NULL;
 	ioend->io_type = type;
@@ -1409,6 +1457,8 @@  xfs_end_io_direct_write(
 
 	if (is_async) {
 		ioend->io_isasync = 1;
+		if (xfs_ioend_needs_cache_flush(ioend))
+			ioend->io_needs_fsync = 1;
 		xfs_finish_ioend(ioend);
 	} else {
 		xfs_finish_ioend_sync(ioend);
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index c325abb..e48c7c2 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -47,6 +47,7 @@  typedef struct xfs_ioend {
 	atomic_t		io_remaining;	/* hold count */
 	unsigned int		io_isasync : 1;	/* needs aio_complete */
 	unsigned int		io_isdirect : 1;/* direct I/O */
+	unsigned int		io_needs_fsync : 1; /* aio+dio+o_sync */
 	struct inode		*io_inode;	/* file being written to */
 	struct buffer_head	*io_buffer_head;/* buffer linked list head */
 	struct buffer_head	*io_buffer_tail;/* buffer linked list tail */
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index deee09e..ecd3d2e 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -209,6 +209,7 @@  typedef struct xfs_mount {
 	struct workqueue_struct	*m_data_workqueue;
 	struct workqueue_struct	*m_unwritten_workqueue;
 	struct workqueue_struct	*m_cil_workqueue;
+	struct workqueue_struct *m_aio_blkdev_flush_wq;
 } xfs_mount_t;
 
 /*
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 26a09bd..b05b557 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -863,8 +863,15 @@  xfs_init_mount_workqueues(
 			WQ_MEM_RECLAIM, 0, mp->m_fsname);
 	if (!mp->m_cil_workqueue)
 		goto out_destroy_unwritten;
+
+	mp->m_aio_blkdev_flush_wq = alloc_workqueue("xfs-aio-blkdev-flush/%s",
+			WQ_MEM_RECLAIM, 0, mp->m_fsname);
+	if (!mp->m_aio_blkdev_flush_wq)
+		goto out_destroy_cil_queue;
 	return 0;
 
+out_destroy_cil_queue:
+	destroy_workqueue(mp->m_cil_workqueue);
 out_destroy_unwritten:
 	destroy_workqueue(mp->m_unwritten_workqueue);
 out_destroy_data_iodone_queue:
@@ -877,6 +884,7 @@  STATIC void
 xfs_destroy_mount_workqueues(
 	struct xfs_mount	*mp)
 {
+	destroy_workqueue(mp->m_aio_blkdev_flush_wq);
 	destroy_workqueue(mp->m_cil_workqueue);
 	destroy_workqueue(mp->m_data_workqueue);
 	destroy_workqueue(mp->m_unwritten_workqueue);