[v2,5/6] ext4: introduce direct IO write path using iomap infrastructure
diff mbox series

Message ID 7c2f0ee02b2659d5a45f3e30dbee66b443b5ea0a.1567978633.git.mbobrowski@mbobrowski.org
State New
Headers show
Series
  • ext4: port direct IO to iomap infrastructure
Related show

Commit Message

Matthew Bobrowski Sept. 8, 2019, 11:19 p.m. UTC
This patch introduces a new direct IO write code path implementation
that makes use of the iomap infrastructure.

All direct IO write operations are now passed from the ->write_iter()
callback to the new function ext4_dio_write_iter(). This function is
responsible for calling into iomap infrastructure via
iomap_dio_rw(). Snippets of the direct IO code from within
ext4_file_write_iter(), such as checking whether the IO request is
unaligned asynchronous IO, or whether it will ber overwriting
allocated and initialized blocks has been moved out and into
ext4_dio_write_iter().

The block mapping flags that are passed to ext4_map_blocks() from
within ext4_dio_get_block() and friends have effectively been taken
out and introduced within the ext4_iomap_begin(). If ext4_map_blocks()
happens to have instantiated blocks beyond the i_size, then we attempt
to place the inode onto the orphan list. Despite being able to perform
i_size extension checking earlier on in the direct IO code path, it
makes most sense to perform this bit post successful block allocation.

The ->end_io() callback ext4_dio_write_end_io() is responsible for
removing the inode from the orphan list and determining if we should
truncate a failed write in the case of an error. We also convert a
range of unwritten extents to written if IOMAP_DIO_UNWRITTEN is set
and perform the necessary i_size/i_disksize extension if the
iocb->ki_pos + dio->size > i_size_read(inode).

In the instance of a short write, we fallback to buffered IO and
complete whatever is left the 'iter'. Any blocks that may have been
allocated in preparation for direct IO will be reused by buffered IO,
so there's no issue with leaving allocated blocks beyond EOF.

Signed-off-by: Matthew Bobrowski <mbobrowski@mbobrowski.org>
---
 fs/ext4/file.c  | 219 +++++++++++++++++++++++++++++++++---------------
 fs/ext4/inode.c |  57 ++++++++++---
 2 files changed, 198 insertions(+), 78 deletions(-)

Comments

Ritesh Harjani Sept. 9, 2019, 9:20 a.m. UTC | #1
On 9/9/19 4:49 AM, Matthew Bobrowski wrote:
> This patch introduces a new direct IO write code path implementation
> that makes use of the iomap infrastructure.
> 
> All direct IO write operations are now passed from the ->write_iter()
> callback to the new function ext4_dio_write_iter(). This function is
> responsible for calling into iomap infrastructure via
> iomap_dio_rw(). Snippets of the direct IO code from within
> ext4_file_write_iter(), such as checking whether the IO request is
> unaligned asynchronous IO, or whether it will ber overwriting
> allocated and initialized blocks has been moved out and into
> ext4_dio_write_iter().
> 
> The block mapping flags that are passed to ext4_map_blocks() from
> within ext4_dio_get_block() and friends have effectively been taken
> out and introduced within the ext4_iomap_begin(). If ext4_map_blocks()
> happens to have instantiated blocks beyond the i_size, then we attempt
> to place the inode onto the orphan list. Despite being able to perform
> i_size extension checking earlier on in the direct IO code path, it
> makes most sense to perform this bit post successful block allocation.
> 
> The ->end_io() callback ext4_dio_write_end_io() is responsible for
> removing the inode from the orphan list and determining if we should
> truncate a failed write in the case of an error. We also convert a
> range of unwritten extents to written if IOMAP_DIO_UNWRITTEN is set
> and perform the necessary i_size/i_disksize extension if the
> iocb->ki_pos + dio->size > i_size_read(inode).
> 
> In the instance of a short write, we fallback to buffered IO and
> complete whatever is left the 'iter'. Any blocks that may have been
> allocated in preparation for direct IO will be reused by buffered IO,
> so there's no issue with leaving allocated blocks beyond EOF.
> 
> Signed-off-by: Matthew Bobrowski <mbobrowski@mbobrowski.org>

Looks good to me.

Reviewed-by: Ritesh Harjani <riteshh@linux.ibm.com>

> ---
>   fs/ext4/file.c  | 219 +++++++++++++++++++++++++++++++++---------------
>   fs/ext4/inode.c |  57 ++++++++++---
>   2 files changed, 198 insertions(+), 78 deletions(-)
> 
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 8e586198f6e6..bf22425a6a6f 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -29,6 +29,7 @@
>   #include <linux/pagevec.h>
>   #include <linux/uio.h>
>   #include <linux/mman.h>
> +#include <linux/backing-dev.h>
>   #include "ext4.h"
>   #include "ext4_jbd2.h"
>   #include "xattr.h"
> @@ -217,6 +218,14 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
>   	if (ret <= 0)
>   		return ret;
> 
> +	ret = file_remove_privs(iocb->ki_filp);
> +	if (ret)
> +		return 0;
> +
> +	ret = file_update_time(iocb->ki_filp);
> +	if (ret)
> +		return 0;
> +
>   	if (unlikely(IS_IMMUTABLE(inode)))
>   		return -EPERM;
> 
> @@ -234,6 +243,34 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
>   	return iov_iter_count(from);
>   }
> 
> +static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
> +					struct iov_iter *from)
> +{
> +	ssize_t ret;
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +
> +	if (iocb->ki_flags & IOCB_NOWAIT)
> +		return -EOPNOTSUPP;
> +
> +	if (!inode_trylock(inode))
> +		inode_lock(inode);
> +
> +	ret = ext4_write_checks(iocb, from);
> +	if (ret <= 0)
> +		goto out;
> +
> +	current->backing_dev_info = inode_to_bdi(inode);
> +	ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
> +	current->backing_dev_info = NULL;
> +out:
> +	inode_unlock(inode);
> +	if (likely(ret > 0)) {
> +		iocb->ki_pos += ret;
> +		ret = generic_write_sync(iocb, ret);
> +	}
> +	return ret;
> +}
> +
>   static int ext4_handle_inode_extension(struct inode *inode, loff_t offset,
>   				       ssize_t len, size_t count)
>   {
> @@ -311,6 +348,118 @@ static int ext4_handle_failed_inode_extension(struct inode *inode, loff_t size)
>   	return ret;
>   }
> 
> +/*
> + * For a write that extends the inode size, ext4_dio_write_iter() will
> + * wait for the write to complete. Consequently, operations performed
> + * within this function are still covered by the inode_lock().
> + */
> +static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error,
> +				 unsigned int flags)
> +{
> +	int ret = 0;
> +	loff_t offset = iocb->ki_pos;
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +
> +	if (error) {
> +		ret = ext4_handle_failed_inode_extension(inode, offset + size);
> +		return ret ? ret : error;
> +	}
> +
> +	if (flags & IOMAP_DIO_UNWRITTEN) {
> +		ret = ext4_convert_unwritten_extents(NULL, inode,
> +						     offset, size);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	if (offset + size > i_size_read(inode)) {
> +		ret = ext4_handle_inode_extension(inode, offset, size, 0);
> +		if (ret)
> +			return ret;
> +	}
> +	return ret;
> +}
> +
> +static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
> +{
> +	ssize_t ret;
> +	loff_t offset = iocb->ki_pos;
> +	size_t count = iov_iter_count(from);
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +	bool extend = false, overwrite = false, unaligned_aio = false;
> +
> +	if (!inode_trylock(inode)) {
> +		if (iocb->ki_flags & IOCB_NOWAIT)
> +			return -EAGAIN;
> +		inode_lock(inode);
> +	}
> +
> +	if (!ext4_dio_checks(inode)) {
> +		inode_unlock(inode);
> +		/*
> +		 * Fallback to buffered IO if the operation on the
> +		 * inode is not supported by direct IO.
> +		 */
> +		return ext4_buffered_write_iter(iocb, from);
> +	}
> +
> +	ret = ext4_write_checks(iocb, from);
> +	if (ret <= 0) {
> +		inode_unlock(inode);
> +		return ret;
> +	}
> +
> +	/*
> +	 * Unaligned direct AIO must be serialized among each other as
> +	 * the zeroing of partial blocks of two competing unaligned
> +	 * AIOs can result in data corruption.
> +	 */
> +	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
> +	    !is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) {
> +		unaligned_aio = true;
> +		inode_dio_wait(inode);
> +	}
> +
> +	/*
> +	 * Determine whether the IO operation will overwrite allocated
> +	 * and initialized blocks. If so, check to see whether it is
> +	 * possible to take the dioread_nolock path.
> +	 */
> +	if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) &&
> +	    ext4_should_dioread_nolock(inode)) {
> +		overwrite = true;
> +		downgrade_write(&inode->i_rwsem);
> +	}
> +
> +	if (offset + count > i_size_read(inode) ||
> +	    offset + count > EXT4_I(inode)->i_disksize) {
> +		ext4_update_i_disksize(inode, inode->i_size);
> +		extend = true;
> +	}
> +
> +	ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, ext4_dio_write_end_io);
> +
> +	/*
> +	 * Unaligned direct AIO must be the only IO in flight or else
> +	 * any overlapping aligned IO after unaligned IO might result
> +	 * in data corruption. We also need to wait here in the case
> +	 * where the inode is being extended so that inode extension
> +	 * routines in ext4_dio_write_end_io() are covered by the
> +	 * inode_lock().
> +	 */
> +	if (ret == -EIOCBQUEUED && (unaligned_aio || extend))
> +		inode_dio_wait(inode);
> +
> +	if (overwrite)
> +		inode_unlock_shared(inode);
> +	else
> +		inode_unlock(inode);
> +
> +	if (ret >= 0 && iov_iter_count(from))
> +		return ext4_buffered_write_iter(iocb, from);
> +	return ret;
> +}
> +
>   #ifdef CONFIG_FS_DAX
>   static ssize_t
>   ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
> @@ -325,15 +474,10 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
>   			return -EAGAIN;
>   		inode_lock(inode);
>   	}
> +
>   	ret = ext4_write_checks(iocb, from);
>   	if (ret <= 0)
>   		goto out;
> -	ret = file_remove_privs(iocb->ki_filp);
> -	if (ret)
> -		goto out;
> -	ret = file_update_time(iocb->ki_filp);
> -	if (ret)
> -		goto out;
> 
>   	offset = iocb->ki_pos;
>   	ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
> @@ -359,73 +503,16 @@ static ssize_t
>   ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>   {
>   	struct inode *inode = file_inode(iocb->ki_filp);
> -	int o_direct = iocb->ki_flags & IOCB_DIRECT;
> -	int unaligned_aio = 0;
> -	int overwrite = 0;
> -	ssize_t ret;
> 
>   	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
>   		return -EIO;
> 
> -#ifdef CONFIG_FS_DAX
>   	if (IS_DAX(inode))
>   		return ext4_dax_write_iter(iocb, from);
> -#endif
> -	if (!o_direct && (iocb->ki_flags & IOCB_NOWAIT))
> -		return -EOPNOTSUPP;
> 
> -	if (!inode_trylock(inode)) {
> -		if (iocb->ki_flags & IOCB_NOWAIT)
> -			return -EAGAIN;
> -		inode_lock(inode);
> -	}
> -
> -	ret = ext4_write_checks(iocb, from);
> -	if (ret <= 0)
> -		goto out;
> -
> -	/*
> -	 * Unaligned direct AIO must be serialized among each other as zeroing
> -	 * of partial blocks of two competing unaligned AIOs can result in data
> -	 * corruption.
> -	 */
> -	if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
> -	    !is_sync_kiocb(iocb) &&
> -	    ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
> -		unaligned_aio = 1;
> -		ext4_unwritten_wait(inode);
> -	}
> -
> -	iocb->private = &overwrite;
> -	/* Check whether we do a DIO overwrite or not */
> -	if (o_direct && !unaligned_aio) {
> -		if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
> -			if (ext4_should_dioread_nolock(inode))
> -				overwrite = 1;
> -		} else if (iocb->ki_flags & IOCB_NOWAIT) {
> -			ret = -EAGAIN;
> -			goto out;
> -		}
> -	}
> -
> -	ret = __generic_file_write_iter(iocb, from);
> -	/*
> -	 * Unaligned direct AIO must be the only IO in flight. Otherwise
> -	 * overlapping aligned IO after unaligned might result in data
> -	 * corruption.
> -	 */
> -	if (ret == -EIOCBQUEUED && unaligned_aio)
> -		ext4_unwritten_wait(inode);
> -	inode_unlock(inode);
> -
> -	if (ret > 0)
> -		ret = generic_write_sync(iocb, ret);
> -
> -	return ret;
> -
> -out:
> -	inode_unlock(inode);
> -	return ret;
> +	if (iocb->ki_flags & IOCB_DIRECT)
> +		return ext4_dio_write_iter(iocb, from);
> +	return ext4_buffered_write_iter(iocb, from);
>   }
> 
>   #ifdef CONFIG_FS_DAX
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index efb184928e51..f52ad3065236 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -3513,11 +3513,13 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
>   			}
>   		}
>   	} else if (flags & IOMAP_WRITE) {
> -		int dio_credits;
>   		handle_t *handle;
> -		int retries = 0;
> +		int dio_credits, retries = 0, m_flags = 0;
> 
> -		/* Trim mapping request to maximum we can map at once for DIO */
> +		/*
> +		 * Trim mapping request to maximum we can map at once
> +		 * for DIO.
> +		 */
>   		if (map.m_len > DIO_MAX_BLOCKS)
>   			map.m_len = DIO_MAX_BLOCKS;
>   		dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
> @@ -3533,8 +3535,30 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
>   		if (IS_ERR(handle))
>   			return PTR_ERR(handle);
> 
> -		ret = ext4_map_blocks(handle, inode, &map,
> -				      EXT4_GET_BLOCKS_CREATE_ZERO);
> +		/*
> +		 * DAX and direct IO are the only two operations that
> +		 * are currently supported with IOMAP_WRITE.
> +		 */
> +		WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT));
> +		if (IS_DAX(inode))
> +			m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
> +		else if (round_down(offset, i_blocksize(inode)) >=
> +			 i_size_read(inode))
> +			m_flags = EXT4_GET_BLOCKS_CREATE;
> +		else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
> +			m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
> +
> +		ret = ext4_map_blocks(handle, inode, &map, m_flags);
> +
> +		/*
> +		 * We cannot fill holes in indirect tree based inodes
> +		 * as that could expose stale data in the case of a
> +		 * crash. Use the magic error code to fallback to
> +		 * buffered IO.
> +		 */
> +		if (!m_flags && !ret)
> +			ret = -ENOTBLK;
> +
>   		if (ret < 0) {
>   			ext4_journal_stop(handle);
>   			if (ret == -ENOSPC &&
> @@ -3544,13 +3568,14 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
>   		}
> 
>   		/*
> -		 * If we added blocks beyond i_size, we need to make sure they
> -		 * will get truncated if we crash before updating i_size in
> -		 * ext4_iomap_end(). For faults we don't need to do that (and
> -		 * even cannot because for orphan list operations inode_lock is
> -		 * required) - if we happen to instantiate block beyond i_size,
> -		 * it is because we race with truncate which has already added
> -		 * the inode to the orphan list.
> +		 * If we added blocks beyond i_size, we need to make
> +		 * sure they will get truncated if we crash before
> +		 * updating the i_size. For faults we don't need to do
> +		 * that (and even cannot because for orphan list
> +		 * operations inode_lock is required) - if we happen
> +		 * to instantiate block beyond i_size, it is because
> +		 * we race with truncate which has already added the
> +		 * inode to the orphan list.
>   		 */
>   		if (!(flags & IOMAP_FAULT) && first_block + map.m_len >
>   		    (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) {
> @@ -3612,6 +3637,14 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
>   static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
>   			  ssize_t written, unsigned flags, struct iomap *iomap)
>   {
> +	/*
> +	 * Check to see whether an error occurred while writing data
> +	 * out to allocated blocks. If so, return the magic error code
> +	 * so that we fallback to buffered IO and reuse the blocks
> +	 * that were allocated in preparation for the direct IO write.
> +	 */
> +	if (flags & IOMAP_DIRECT && written == 0)
> +		return -ENOTBLK;
>   	return 0;
>   }
>
Ritesh Harjani Sept. 9, 2019, 9:26 a.m. UTC | #2
On 9/9/19 4:49 AM, Matthew Bobrowski wrote:
> This patch introduces a new direct IO write code path implementation
> that makes use of the iomap infrastructure.
> 
> All direct IO write operations are now passed from the ->write_iter()
> callback to the new function ext4_dio_write_iter(). This function is
> responsible for calling into iomap infrastructure via
> iomap_dio_rw(). Snippets of the direct IO code from within
> ext4_file_write_iter(), such as checking whether the IO request is
> unaligned asynchronous IO, or whether it will ber overwriting
> allocated and initialized blocks has been moved out and into
> ext4_dio_write_iter().
> 
> The block mapping flags that are passed to ext4_map_blocks() from
> within ext4_dio_get_block() and friends have effectively been taken
> out and introduced within the ext4_iomap_begin(). If ext4_map_blocks()
> happens to have instantiated blocks beyond the i_size, then we attempt
> to place the inode onto the orphan list. Despite being able to perform
> i_size extension checking earlier on in the direct IO code path, it
> makes most sense to perform this bit post successful block allocation.
> 
> The ->end_io() callback ext4_dio_write_end_io() is responsible for
> removing the inode from the orphan list and determining if we should
> truncate a failed write in the case of an error. We also convert a
> range of unwritten extents to written if IOMAP_DIO_UNWRITTEN is set
> and perform the necessary i_size/i_disksize extension if the
> iocb->ki_pos + dio->size > i_size_read(inode).
> 
> In the instance of a short write, we fallback to buffered IO and
> complete whatever is left the 'iter'. Any blocks that may have been
> allocated in preparation for direct IO will be reused by buffered IO,
> so there's no issue with leaving allocated blocks beyond EOF.
> 
> Signed-off-by: Matthew Bobrowski <mbobrowski@mbobrowski.org>

Sorry some minor simplification comments. Forgot to respond in previous 
email.

Otherwise looks good.

Reviewed-by: Ritesh Harjani <riteshh@linux.ibm.com>



> ---
>   fs/ext4/file.c  | 219 +++++++++++++++++++++++++++++++++---------------
>   fs/ext4/inode.c |  57 ++++++++++---
>   2 files changed, 198 insertions(+), 78 deletions(-)
> 
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 8e586198f6e6..bf22425a6a6f 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -29,6 +29,7 @@
>   #include <linux/pagevec.h>
>   #include <linux/uio.h>
>   #include <linux/mman.h>
> +#include <linux/backing-dev.h>
>   #include "ext4.h"
>   #include "ext4_jbd2.h"
>   #include "xattr.h"
> @@ -217,6 +218,14 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
>   	if (ret <= 0)
>   		return ret;
> 
> +	ret = file_remove_privs(iocb->ki_filp);
> +	if (ret)
> +		return 0;
> +
> +	ret = file_update_time(iocb->ki_filp);
> +	if (ret)
> +		return 0;
> +
>   	if (unlikely(IS_IMMUTABLE(inode)))
>   		return -EPERM;
> 
> @@ -234,6 +243,34 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
>   	return iov_iter_count(from);
>   }
> 
> +static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
> +					struct iov_iter *from)
> +{
> +	ssize_t ret;
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +
> +	if (iocb->ki_flags & IOCB_NOWAIT)
> +		return -EOPNOTSUPP;
> +
> +	if (!inode_trylock(inode))
> +		inode_lock(inode);
> +
> +	ret = ext4_write_checks(iocb, from);
> +	if (ret <= 0)
> +		goto out;
> +
> +	current->backing_dev_info = inode_to_bdi(inode);
> +	ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
> +	current->backing_dev_info = NULL;
> +out:
> +	inode_unlock(inode);
> +	if (likely(ret > 0)) {
> +		iocb->ki_pos += ret;
> +		ret = generic_write_sync(iocb, ret);
> +	}
> +	return ret;
> +}
> +
>   static int ext4_handle_inode_extension(struct inode *inode, loff_t offset,
>   				       ssize_t len, size_t count)
>   {
> @@ -311,6 +348,118 @@ static int ext4_handle_failed_inode_extension(struct inode *inode, loff_t size)
>   	return ret;
>   }
> 
> +/*
> + * For a write that extends the inode size, ext4_dio_write_iter() will
> + * wait for the write to complete. Consequently, operations performed
> + * within this function are still covered by the inode_lock().
> + */
Maybe add a comment that on success this returns 0.

> +static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error,
> +				 unsigned int flags)
> +{
> +	int ret = 0;
No need to initialize ret.


> +	loff_t offset = iocb->ki_pos;
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +
> +	if (error) {
> +		ret = ext4_handle_failed_inode_extension(inode, offset + size);
> +		return ret ? ret : error;
> +	}
> +
> +	if (flags & IOMAP_DIO_UNWRITTEN) {
> +		ret = ext4_convert_unwritten_extents(NULL, inode,
> +						     offset, size);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	if (offset + size > i_size_read(inode)) {
> +		ret = ext4_handle_inode_extension(inode, offset, size, 0);
> +		if (ret)
> +			return ret;
> +	}
> +	return ret;
Directly return 0, since if it falls here it mans it is a success case.
You are anyway returning error from above error paths.

> +}
> +
> +static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
> +{
> +	ssize_t ret;
> +	loff_t offset = iocb->ki_pos;
> +	size_t count = iov_iter_count(from);
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +	bool extend = false, overwrite = false, unaligned_aio = false;
> +
> +	if (!inode_trylock(inode)) {
> +		if (iocb->ki_flags & IOCB_NOWAIT)
> +			return -EAGAIN;
> +		inode_lock(inode);
> +	}
> +
> +	if (!ext4_dio_checks(inode)) {
> +		inode_unlock(inode);
> +		/*
> +		 * Fallback to buffered IO if the operation on the
> +		 * inode is not supported by direct IO.
> +		 */
> +		return ext4_buffered_write_iter(iocb, from);
> +	}
> +
> +	ret = ext4_write_checks(iocb, from);
> +	if (ret <= 0) {
> +		inode_unlock(inode);
> +		return ret;
> +	}
> +
> +	/*
> +	 * Unaligned direct AIO must be serialized among each other as
> +	 * the zeroing of partial blocks of two competing unaligned
> +	 * AIOs can result in data corruption.
> +	 */
> +	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
> +	    !is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) {
> +		unaligned_aio = true;
> +		inode_dio_wait(inode);
> +	}
> +
> +	/*
> +	 * Determine whether the IO operation will overwrite allocated
> +	 * and initialized blocks. If so, check to see whether it is
> +	 * possible to take the dioread_nolock path.
> +	 */
> +	if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) &&
> +	    ext4_should_dioread_nolock(inode)) {
> +		overwrite = true;
> +		downgrade_write(&inode->i_rwsem);
> +	}
> +
> +	if (offset + count > i_size_read(inode) ||
> +	    offset + count > EXT4_I(inode)->i_disksize) {
> +		ext4_update_i_disksize(inode, inode->i_size);
> +		extend = true;
> +	}
> +
> +	ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, ext4_dio_write_end_io);
> +
> +	/*
> +	 * Unaligned direct AIO must be the only IO in flight or else
> +	 * any overlapping aligned IO after unaligned IO might result
> +	 * in data corruption. We also need to wait here in the case
> +	 * where the inode is being extended so that inode extension
> +	 * routines in ext4_dio_write_end_io() are covered by the
> +	 * inode_lock().
> +	 */
> +	if (ret == -EIOCBQUEUED && (unaligned_aio || extend))
> +		inode_dio_wait(inode);
> +
> +	if (overwrite)
> +		inode_unlock_shared(inode);
> +	else
> +		inode_unlock(inode);
> +
> +	if (ret >= 0 && iov_iter_count(from))
> +		return ext4_buffered_write_iter(iocb, from);
> +	return ret;
> +}
> +
>   #ifdef CONFIG_FS_DAX
>   static ssize_t
>   ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
> @@ -325,15 +474,10 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
>   			return -EAGAIN;
>   		inode_lock(inode);
>   	}
> +
>   	ret = ext4_write_checks(iocb, from);
>   	if (ret <= 0)
>   		goto out;
> -	ret = file_remove_privs(iocb->ki_filp);
> -	if (ret)
> -		goto out;
> -	ret = file_update_time(iocb->ki_filp);
> -	if (ret)
> -		goto out;
> 
>   	offset = iocb->ki_pos;
>   	ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
> @@ -359,73 +503,16 @@ static ssize_t
>   ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>   {
>   	struct inode *inode = file_inode(iocb->ki_filp);
> -	int o_direct = iocb->ki_flags & IOCB_DIRECT;
> -	int unaligned_aio = 0;
> -	int overwrite = 0;
> -	ssize_t ret;
> 
>   	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
>   		return -EIO;
> 
> -#ifdef CONFIG_FS_DAX
>   	if (IS_DAX(inode))
>   		return ext4_dax_write_iter(iocb, from);
> -#endif
> -	if (!o_direct && (iocb->ki_flags & IOCB_NOWAIT))
> -		return -EOPNOTSUPP;
> 
> -	if (!inode_trylock(inode)) {
> -		if (iocb->ki_flags & IOCB_NOWAIT)
> -			return -EAGAIN;
> -		inode_lock(inode);
> -	}
> -
> -	ret = ext4_write_checks(iocb, from);
> -	if (ret <= 0)
> -		goto out;
> -
> -	/*
> -	 * Unaligned direct AIO must be serialized among each other as zeroing
> -	 * of partial blocks of two competing unaligned AIOs can result in data
> -	 * corruption.
> -	 */
> -	if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
> -	    !is_sync_kiocb(iocb) &&
> -	    ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
> -		unaligned_aio = 1;
> -		ext4_unwritten_wait(inode);
> -	}
> -
> -	iocb->private = &overwrite;
> -	/* Check whether we do a DIO overwrite or not */
> -	if (o_direct && !unaligned_aio) {
> -		if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
> -			if (ext4_should_dioread_nolock(inode))
> -				overwrite = 1;
> -		} else if (iocb->ki_flags & IOCB_NOWAIT) {
> -			ret = -EAGAIN;
> -			goto out;
> -		}
> -	}
> -
> -	ret = __generic_file_write_iter(iocb, from);
> -	/*
> -	 * Unaligned direct AIO must be the only IO in flight. Otherwise
> -	 * overlapping aligned IO after unaligned might result in data
> -	 * corruption.
> -	 */
> -	if (ret == -EIOCBQUEUED && unaligned_aio)
> -		ext4_unwritten_wait(inode);
> -	inode_unlock(inode);
> -
> -	if (ret > 0)
> -		ret = generic_write_sync(iocb, ret);
> -
> -	return ret;
> -
> -out:
> -	inode_unlock(inode);
> -	return ret;
> +	if (iocb->ki_flags & IOCB_DIRECT)
> +		return ext4_dio_write_iter(iocb, from);
> +	return ext4_buffered_write_iter(iocb, from);
>   }
> 
>   #ifdef CONFIG_FS_DAX
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index efb184928e51..f52ad3065236 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -3513,11 +3513,13 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
>   			}
>   		}
>   	} else if (flags & IOMAP_WRITE) {
> -		int dio_credits;
>   		handle_t *handle;
> -		int retries = 0;
> +		int dio_credits, retries = 0, m_flags = 0;
> 
> -		/* Trim mapping request to maximum we can map at once for DIO */
> +		/*
> +		 * Trim mapping request to maximum we can map at once
> +		 * for DIO.
> +		 */
>   		if (map.m_len > DIO_MAX_BLOCKS)
>   			map.m_len = DIO_MAX_BLOCKS;
>   		dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
> @@ -3533,8 +3535,30 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
>   		if (IS_ERR(handle))
>   			return PTR_ERR(handle);
> 
> -		ret = ext4_map_blocks(handle, inode, &map,
> -				      EXT4_GET_BLOCKS_CREATE_ZERO);
> +		/*
> +		 * DAX and direct IO are the only two operations that
> +		 * are currently supported with IOMAP_WRITE.
> +		 */
> +		WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT));
> +		if (IS_DAX(inode))
> +			m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
> +		else if (round_down(offset, i_blocksize(inode)) >=
> +			 i_size_read(inode))
> +			m_flags = EXT4_GET_BLOCKS_CREATE;
> +		else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
> +			m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
> +
> +		ret = ext4_map_blocks(handle, inode, &map, m_flags);
> +
> +		/*
> +		 * We cannot fill holes in indirect tree based inodes
> +		 * as that could expose stale data in the case of a
> +		 * crash. Use the magic error code to fallback to
> +		 * buffered IO.
> +		 */

I like this comment ;)
Help others to understand what is really going on here.

> +		if (!m_flags && !ret)
> +			ret = -ENOTBLK;
> +
>   		if (ret < 0) {
>   			ext4_journal_stop(handle);
>   			if (ret == -ENOSPC &&
> @@ -3544,13 +3568,14 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
>   		}
> 
>   		/*
> -		 * If we added blocks beyond i_size, we need to make sure they
> -		 * will get truncated if we crash before updating i_size in
> -		 * ext4_iomap_end(). For faults we don't need to do that (and
> -		 * even cannot because for orphan list operations inode_lock is
> -		 * required) - if we happen to instantiate block beyond i_size,
> -		 * it is because we race with truncate which has already added
> -		 * the inode to the orphan list.
> +		 * If we added blocks beyond i_size, we need to make
> +		 * sure they will get truncated if we crash before
> +		 * updating the i_size. For faults we don't need to do
> +		 * that (and even cannot because for orphan list
> +		 * operations inode_lock is required) - if we happen
> +		 * to instantiate block beyond i_size, it is because
> +		 * we race with truncate which has already added the
> +		 * inode to the orphan list.
>   		 */
>   		if (!(flags & IOMAP_FAULT) && first_block + map.m_len >
>   		    (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) {
> @@ -3612,6 +3637,14 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
>   static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
>   			  ssize_t written, unsigned flags, struct iomap *iomap)
>   {
> +	/*
> +	 * Check to see whether an error occurred while writing data
> +	 * out to allocated blocks. If so, return the magic error code
> +	 * so that we fallback to buffered IO and reuse the blocks
> +	 * that were allocated in preparation for the direct IO write.
> +	 */
> +	if (flags & IOMAP_DIRECT && written == 0)
> +		return -ENOTBLK;
>   	return 0;
>   }
>
Ritesh Harjani Sept. 9, 2019, 2:32 p.m. UTC | #3
Please excuse me for my queries here again.
Hoping my query here will be useful for others also to understand
about what is happening under the hood here (for ext4 & XFS) :)


On 9/9/19 2:56 PM, Ritesh Harjani wrote:
> 
> 
> On 9/9/19 4:49 AM, Matthew Bobrowski wrote:
>> This patch introduces a new direct IO write code path implementation
>> that makes use of the iomap infrastructure.
>>
>> All direct IO write operations are now passed from the ->write_iter()
>> callback to the new function ext4_dio_write_iter(). This function is
>> responsible for calling into iomap infrastructure via
>> iomap_dio_rw(). Snippets of the direct IO code from within
>> ext4_file_write_iter(), such as checking whether the IO request is
>> unaligned asynchronous IO, or whether it will ber overwriting
>> allocated and initialized blocks has been moved out and into
>> ext4_dio_write_iter().
>>
>> The block mapping flags that are passed to ext4_map_blocks() from
>> within ext4_dio_get_block() and friends have effectively been taken
>> out and introduced within the ext4_iomap_begin(). If ext4_map_blocks()
>> happens to have instantiated blocks beyond the i_size, then we attempt
>> to place the inode onto the orphan list. Despite being able to perform
>> i_size extension checking earlier on in the direct IO code path, it
>> makes most sense to perform this bit post successful block allocation.
>>
>> The ->end_io() callback ext4_dio_write_end_io() is responsible for
>> removing the inode from the orphan list and determining if we should
>> truncate a failed write in the case of an error. We also convert a
>> range of unwritten extents to written if IOMAP_DIO_UNWRITTEN is set
>> and perform the necessary i_size/i_disksize extension if the
>> iocb->ki_pos + dio->size > i_size_read(inode).
>>
>> In the instance of a short write, we fallback to buffered IO and
>> complete whatever is left the 'iter'. Any blocks that may have been
>> allocated in preparation for direct IO will be reused by buffered IO,
>> so there's no issue with leaving allocated blocks beyond EOF.
>>
>> Signed-off-by: Matthew Bobrowski <mbobrowski@mbobrowski.org>
> 
> Sorry some minor simplification comments. Forgot to respond in previous 
> email.
> 
> Otherwise looks good.
> 
> Reviewed-by: Ritesh Harjani <riteshh@linux.ibm.com>
> 
> 
> 
>> ---
>>   fs/ext4/file.c  | 219 +++++++++++++++++++++++++++++++++---------------
>>   fs/ext4/inode.c |  57 ++++++++++---
>>   2 files changed, 198 insertions(+), 78 deletions(-)
>>
>> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
>> index 8e586198f6e6..bf22425a6a6f 100644
>> --- a/fs/ext4/file.c
>> +++ b/fs/ext4/file.c
>> @@ -29,6 +29,7 @@
>>   #include <linux/pagevec.h>
>>   #include <linux/uio.h>
>>   #include <linux/mman.h>
>> +#include <linux/backing-dev.h>
>>   #include "ext4.h"
>>   #include "ext4_jbd2.h"
>>   #include "xattr.h"
>> @@ -217,6 +218,14 @@ static ssize_t ext4_write_checks(struct kiocb 
>> *iocb, struct iov_iter *from)
>>       if (ret <= 0)
>>           return ret;
>>
>> +    ret = file_remove_privs(iocb->ki_filp);
>> +    if (ret)
>> +        return 0;
>> +
>> +    ret = file_update_time(iocb->ki_filp);
>> +    if (ret)
>> +        return 0;
>> +
>>       if (unlikely(IS_IMMUTABLE(inode)))
>>           return -EPERM;
>>
>> @@ -234,6 +243,34 @@ static ssize_t ext4_write_checks(struct kiocb 
>> *iocb, struct iov_iter *from)
>>       return iov_iter_count(from);
>>   }
>>
>> +static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
>> +                    struct iov_iter *from)
>> +{
>> +    ssize_t ret;
>> +    struct inode *inode = file_inode(iocb->ki_filp);
>> +
>> +    if (iocb->ki_flags & IOCB_NOWAIT)
>> +        return -EOPNOTSUPP;
>> +
>> +    if (!inode_trylock(inode))
>> +        inode_lock(inode);
>> +
>> +    ret = ext4_write_checks(iocb, from);
>> +    if (ret <= 0)
>> +        goto out;
>> +
>> +    current->backing_dev_info = inode_to_bdi(inode);
>> +    ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
>> +    current->backing_dev_info = NULL;
>> +out:
>> +    inode_unlock(inode);
>> +    if (likely(ret > 0)) {
>> +        iocb->ki_pos += ret;
>> +        ret = generic_write_sync(iocb, ret);
>> +    }
>> +    return ret;
>> +}
>> +
>>   static int ext4_handle_inode_extension(struct inode *inode, loff_t 
>> offset,
>>                          ssize_t len, size_t count)
>>   {
>> @@ -311,6 +348,118 @@ static int 
>> ext4_handle_failed_inode_extension(struct inode *inode, loff_t size)
>>       return ret;
>>   }
>>
>> +/*
>> + * For a write that extends the inode size, ext4_dio_write_iter() will
>> + * wait for the write to complete. Consequently, operations performed
>> + * within this function are still covered by the inode_lock().
>> + */
> Maybe add a comment that on success this returns 0.
> 
>> +static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, 
>> int error,
>> +                 unsigned int flags)
>> +{
>> +    int ret = 0;
> No need to initialize ret.
> 
> 
>> +    loff_t offset = iocb->ki_pos;
>> +    struct inode *inode = file_inode(iocb->ki_filp);
>> +
>> +    if (error) {
>> +        ret = ext4_handle_failed_inode_extension(inode, offset + size);
>> +        return ret ? ret : error;
>> +    }
>> +
>> +    if (flags & IOMAP_DIO_UNWRITTEN) {
>> +        ret = ext4_convert_unwritten_extents(NULL, inode,
>> +                             offset, size);
>> +        if (ret)
>> +            return ret;
>> +    }
>> +
>> +    if (offset + size > i_size_read(inode)) {
>> +        ret = ext4_handle_inode_extension(inode, offset, size, 0);
>> +        if (ret)
>> +            return ret;
>> +    }
>> +    return ret;
> Directly return 0, since if it falls here it mans it is a success case.
> You are anyway returning error from above error paths.
> 
>> +}
>> +
>> +static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct 
>> iov_iter *from)
>> +{
>> +    ssize_t ret;
>> +    loff_t offset = iocb->ki_pos;
>> +    size_t count = iov_iter_count(from);
>> +    struct inode *inode = file_inode(iocb->ki_filp);
>> +    bool extend = false, overwrite = false, unaligned_aio = false;
>> +
>> +    if (!inode_trylock(inode)) {
>> +        if (iocb->ki_flags & IOCB_NOWAIT)
>> +            return -EAGAIN;
>> +        inode_lock(inode);
>> +    }
>> +
>> +    if (!ext4_dio_checks(inode)) {
>> +        inode_unlock(inode);
>> +        /*
>> +         * Fallback to buffered IO if the operation on the
>> +         * inode is not supported by direct IO.
>> +         */
>> +        return ext4_buffered_write_iter(iocb, from);
>> +    }
>> +
>> +    ret = ext4_write_checks(iocb, from);
>> +    if (ret <= 0) {
>> +        inode_unlock(inode);
>> +        return ret;
>> +    }
>> +
>> +    /*
>> +     * Unaligned direct AIO must be serialized among each other as
>> +     * the zeroing of partial blocks of two competing unaligned
>> +     * AIOs can result in data corruption.
>> +     */
>> +    if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
>> +        !is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, 
>> offset)) {
>> +        unaligned_aio = true;
>> +        inode_dio_wait(inode);
>> +    }
>> +
>> +    /*
>> +     * Determine whether the IO operation will overwrite allocated
>> +     * and initialized blocks. If so, check to see whether it is
>> +     * possible to take the dioread_nolock path.
>> +     */
>> +    if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) &&
>> +        ext4_should_dioread_nolock(inode)) {
>> +        overwrite = true;
>> +        downgrade_write(&inode->i_rwsem);
>> +    }
>> +
>> +    if (offset + count > i_size_read(inode) ||
>> +        offset + count > EXT4_I(inode)->i_disksize) {
>> +        ext4_update_i_disksize(inode, inode->i_size);
>> +        extend = true;
>> +    }
>> +
>> +    ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, 
>> ext4_dio_write_end_io);
>> +
>> +    /*
>> +     * Unaligned direct AIO must be the only IO in flight or else
>> +     * any overlapping aligned IO after unaligned IO might result
>> +     * in data corruption. We also need to wait here in the case
>> +     * where the inode is being extended so that inode extension
>> +     * routines in ext4_dio_write_end_io() are covered by the
>> +     * inode_lock().
>> +     */
>> +    if (ret == -EIOCBQUEUED && (unaligned_aio || extend))
>> +        inode_dio_wait(inode);

So, I looked this more closely into the AIO DIO write extend case
of yours here. AFAICT, this looks good in the sense that it follows
the behavior what we used to have before from __blockdev_direct_IO.
In that it used to wait for AIO DIO writes beyond EOF, but the iomap
framework does not have that. So waiting in case of writes beyond EOF
should be the right thing to do here for ext4 (following the legacy code).

But I would like to confirm the exact race this extend case
is protecting here.
Since writes beyond EOF will require update of inode i_size 
(ext4_update_inode_size()) which require us to hold the inode_lock
in exclusive mode, so we must need to wait in extend case here,
even for AIO DIO writes.

Q1. Is above understanding completely correct?

Q2. Or is there anything else also which it is also protecting which I 
am missing? Do we need to hold inode exclusive lock for
ext4_orphan_del() as well?


Q3. How about XFS then?
(I do see some tricks done with IOLOCK handling in case of ki__pos > 
i_size & to zero out the buffer space between old i_size & ki_pos).

But if we talk only about the above case of extending AIO DIO writes 
beyond EOF, XFS only takes a shared lock. why?

Looking into XFS code, I see that they have IOLOCK & ILOCK.
So I guess for protecting inode->i_size update they use ILOCK (in
xfs_dio_write_end_io() -> xfs_iomap_write_unwritten()
or ip->i_flags_lock lock in NON-UNWRITTEN case).

and for IO part the IOLOCK is used and hence IOLOCK can be used
in shared mode. Is this correct understanding for XFS?

-ritesh

>> +
>> +    if (overwrite)
>> +        inode_unlock_shared(inode);
>> +    else
>> +        inode_unlock(inode);
>> +
>> +    if (ret >= 0 && iov_iter_count(from))
>> +        return ext4_buffered_write_iter(iocb, from);
>> +    return ret;
>> +}
>> +
>>   #ifdef CONFIG_FS_DAX
>>   static ssize_t
>>   ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
>> @@ -325,15 +474,10 @@ ext4_dax_write_iter(struct kiocb *iocb, struct 
>> iov_iter *from)
>>               return -EAGAIN;
>>           inode_lock(inode);
>>       }
>> +
>>       ret = ext4_write_checks(iocb, from);
>>       if (ret <= 0)
>>           goto out;
>> -    ret = file_remove_privs(iocb->ki_filp);
>> -    if (ret)
>> -        goto out;
>> -    ret = file_update_time(iocb->ki_filp);
>> -    if (ret)
>> -        goto out;
>>
>>       offset = iocb->ki_pos;
>>       ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
>> @@ -359,73 +503,16 @@ static ssize_t
>>   ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>>   {
>>       struct inode *inode = file_inode(iocb->ki_filp);
>> -    int o_direct = iocb->ki_flags & IOCB_DIRECT;
>> -    int unaligned_aio = 0;
>> -    int overwrite = 0;
>> -    ssize_t ret;
>>
>>       if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
>>           return -EIO;
>>
>> -#ifdef CONFIG_FS_DAX
>>       if (IS_DAX(inode))
>>           return ext4_dax_write_iter(iocb, from);
>> -#endif
>> -    if (!o_direct && (iocb->ki_flags & IOCB_NOWAIT))
>> -        return -EOPNOTSUPP;
>>
>> -    if (!inode_trylock(inode)) {
>> -        if (iocb->ki_flags & IOCB_NOWAIT)
>> -            return -EAGAIN;
>> -        inode_lock(inode);
>> -    }
>> -
>> -    ret = ext4_write_checks(iocb, from);
>> -    if (ret <= 0)
>> -        goto out;
>> -
>> -    /*
>> -     * Unaligned direct AIO must be serialized among each other as 
>> zeroing
>> -     * of partial blocks of two competing unaligned AIOs can result 
>> in data
>> -     * corruption.
>> -     */
>> -    if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
>> -        !is_sync_kiocb(iocb) &&
>> -        ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
>> -        unaligned_aio = 1;
>> -        ext4_unwritten_wait(inode);
>> -    }
>> -
>> -    iocb->private = &overwrite;
>> -    /* Check whether we do a DIO overwrite or not */
>> -    if (o_direct && !unaligned_aio) {
>> -        if (ext4_overwrite_io(inode, iocb->ki_pos, 
>> iov_iter_count(from))) {
>> -            if (ext4_should_dioread_nolock(inode))
>> -                overwrite = 1;
>> -        } else if (iocb->ki_flags & IOCB_NOWAIT) {
>> -            ret = -EAGAIN;
>> -            goto out;
>> -        }
>> -    }
>> -
>> -    ret = __generic_file_write_iter(iocb, from);
>> -    /*
>> -     * Unaligned direct AIO must be the only IO in flight. Otherwise
>> -     * overlapping aligned IO after unaligned might result in data
>> -     * corruption.
>> -     */
>> -    if (ret == -EIOCBQUEUED && unaligned_aio)
>> -        ext4_unwritten_wait(inode);
>> -    inode_unlock(inode);
>> -
>> -    if (ret > 0)
>> -        ret = generic_write_sync(iocb, ret);
>> -
>> -    return ret;
>> -
>> -out:
>> -    inode_unlock(inode);
>> -    return ret;
>> +    if (iocb->ki_flags & IOCB_DIRECT)
>> +        return ext4_dio_write_iter(iocb, from);
>> +    return ext4_buffered_write_iter(iocb, from);
>>   }
>>
>>   #ifdef CONFIG_FS_DAX
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index efb184928e51..f52ad3065236 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -3513,11 +3513,13 @@ static int ext4_iomap_begin(struct inode 
>> *inode, loff_t offset, loff_t length,
>>               }
>>           }
>>       } else if (flags & IOMAP_WRITE) {
>> -        int dio_credits;
>>           handle_t *handle;
>> -        int retries = 0;
>> +        int dio_credits, retries = 0, m_flags = 0;
>>
>> -        /* Trim mapping request to maximum we can map at once for DIO */
>> +        /*
>> +         * Trim mapping request to maximum we can map at once
>> +         * for DIO.
>> +         */
>>           if (map.m_len > DIO_MAX_BLOCKS)
>>               map.m_len = DIO_MAX_BLOCKS;
>>           dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
>> @@ -3533,8 +3535,30 @@ static int ext4_iomap_begin(struct inode 
>> *inode, loff_t offset, loff_t length,
>>           if (IS_ERR(handle))
>>               return PTR_ERR(handle);
>>
>> -        ret = ext4_map_blocks(handle, inode, &map,
>> -                      EXT4_GET_BLOCKS_CREATE_ZERO);
>> +        /*
>> +         * DAX and direct IO are the only two operations that
>> +         * are currently supported with IOMAP_WRITE.
>> +         */
>> +        WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT));
>> +        if (IS_DAX(inode))
>> +            m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
>> +        else if (round_down(offset, i_blocksize(inode)) >=
>> +             i_size_read(inode))
>> +            m_flags = EXT4_GET_BLOCKS_CREATE;
>> +        else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
>> +            m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
>> +
>> +        ret = ext4_map_blocks(handle, inode, &map, m_flags);
>> +
>> +        /*
>> +         * We cannot fill holes in indirect tree based inodes
>> +         * as that could expose stale data in the case of a
>> +         * crash. Use the magic error code to fallback to
>> +         * buffered IO.
>> +         */
> 
> I like this comment ;)
> Help others to understand what is really going on here.
> 
>> +        if (!m_flags && !ret)
>> +            ret = -ENOTBLK;
>> +
>>           if (ret < 0) {
>>               ext4_journal_stop(handle);
>>               if (ret == -ENOSPC &&
>> @@ -3544,13 +3568,14 @@ static int ext4_iomap_begin(struct inode 
>> *inode, loff_t offset, loff_t length,
>>           }
>>
>>           /*
>> -         * If we added blocks beyond i_size, we need to make sure they
>> -         * will get truncated if we crash before updating i_size in
>> -         * ext4_iomap_end(). For faults we don't need to do that (and
>> -         * even cannot because for orphan list operations inode_lock is
>> -         * required) - if we happen to instantiate block beyond i_size,
>> -         * it is because we race with truncate which has already added
>> -         * the inode to the orphan list.
>> +         * If we added blocks beyond i_size, we need to make
>> +         * sure they will get truncated if we crash before
>> +         * updating the i_size. For faults we don't need to do
>> +         * that (and even cannot because for orphan list
>> +         * operations inode_lock is required) - if we happen
>> +         * to instantiate block beyond i_size, it is because
>> +         * we race with truncate which has already added the
>> +         * inode to the orphan list.
>>            */
>>           if (!(flags & IOMAP_FAULT) && first_block + map.m_len >
>>               (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) {
>> @@ -3612,6 +3637,14 @@ static int ext4_iomap_begin(struct inode 
>> *inode, loff_t offset, loff_t length,
>>   static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t 
>> length,
>>                 ssize_t written, unsigned flags, struct iomap *iomap)
>>   {
>> +    /*
>> +     * Check to see whether an error occurred while writing data
>> +     * out to allocated blocks. If so, return the magic error code
>> +     * so that we fallback to buffered IO and reuse the blocks
>> +     * that were allocated in preparation for the direct IO write.
>> +     */
>> +    if (flags & IOMAP_DIRECT && written == 0)
>> +        return -ENOTBLK;
>>       return 0;
>>   }
>>
>
Matthew Bobrowski Sept. 10, 2019, 10:31 a.m. UTC | #4
On Mon, Sep 09, 2019 at 02:56:15PM +0530, Ritesh Harjani wrote:
> On 9/9/19 4:49 AM, Matthew Bobrowski wrote:
> > +/*
> > + * For a write that extends the inode size, ext4_dio_write_iter() will
> > + * wait for the write to complete. Consequently, operations performed
> > + * within this function are still covered by the inode_lock().
> > + */
> Maybe add a comment that on success this returns 0.

OK, can do.

> > +static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error,
> > +				 unsigned int flags)
> > +{
> > +	int ret = 0;
> No need to initialize ret.
> 
> 
> > +	loff_t offset = iocb->ki_pos;
> > +	struct inode *inode = file_inode(iocb->ki_filp);
> > +
> > +	if (error) {
> > +		ret = ext4_handle_failed_inode_extension(inode, offset + size);
> > +		return ret ? ret : error;
> > +	}
> > +
> > +	if (flags & IOMAP_DIO_UNWRITTEN) {
> > +		ret = ext4_convert_unwritten_extents(NULL, inode,
> > +						     offset, size);
> > +		if (ret)
> > +			return ret;
> > +	}
> > +
> > +	if (offset + size > i_size_read(inode)) {
> > +		ret = ext4_handle_inode_extension(inode, offset, size, 0);
> > +		if (ret)
> > +			return ret;
> > +	}
> > +	return ret;
> Directly return 0, since if it falls here it mans it is a success case.
> You are anyway returning error from above error paths.

OK, sure.

--<M>--
Matthew Bobrowski Sept. 10, 2019, 11:20 a.m. UTC | #5
On Mon, Sep 09, 2019 at 08:02:13PM +0530, Ritesh Harjani wrote:
> On 9/9/19 2:56 PM, Ritesh Harjani wrote:
> > > +    ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops,
> > > ext4_dio_write_end_io);
> > > +
> > > +    /*
> > > +     * Unaligned direct AIO must be the only IO in flight or else
> > > +     * any overlapping aligned IO after unaligned IO might result
> > > +     * in data corruption. We also need to wait here in the case
> > > +     * where the inode is being extended so that inode extension
> > > +     * routines in ext4_dio_write_end_io() are covered by the
> > > +     * inode_lock().
> > > +     */
> > > +    if (ret == -EIOCBQUEUED && (unaligned_aio || extend))
> > > +        inode_dio_wait(inode);
> 
> So, I looked this more closely into the AIO DIO write extend case
> of yours here. AFAICT, this looks good in the sense that it follows
> the behavior what we used to have before from __blockdev_direct_IO.
> In that it used to wait for AIO DIO writes beyond EOF, but the iomap
> framework does not have that. So waiting in case of writes beyond EOF
> should be the right thing to do here for ext4 (following the legacy code).
> 
> But I would like to confirm the exact race this extend case
> is protecting here.
> Since writes beyond EOF will require update of inode i_size
> (ext4_update_inode_size()) which require us to hold the inode_lock
> in exclusive mode, so we must need to wait in extend case here,
> even for AIO DIO writes.
> 
> Q1. Is above understanding completely correct?

Yes, that's essentially correct.

> Q2. Or is there anything else also which it is also protecting which I am
> missing?

No, I think that's it...

> Do we need to hold inode exclusive lock for ext4_orphan_del() as well?

Yes, I believe so.

> Q3. How about XFS then?
> (I do see some tricks done with IOLOCK handling in case of ki__pos > i_size
> & to zero out the buffer space between old i_size & ki_pos).
> 
> But if we talk only about the above case of extending AIO DIO writes beyond
> EOF, XFS only takes a shared lock. why?
> 
> Looking into XFS code, I see that they have IOLOCK & ILOCK.
> So I guess for protecting inode->i_size update they use ILOCK (in
> xfs_dio_write_end_io() -> xfs_iomap_write_unwritten()
> or ip->i_flags_lock lock in NON-UNWRITTEN case). And for IO part the IOLOCK
> is used and hence IOLOCK can be used in shared mode. Is this correct
> understanding for XFS?

* David/Christoph/Darrick

I haven't looked at the intricate XFS locking semantics, so I can't really
comment until I've looked at the code to be perfectly honest. Perhaps asking
one of the XFS maintainers could get you the answer you're looking for on
this.

--<M>--
Ritesh Harjani Sept. 11, 2019, 8:08 a.m. UTC | #6
Hello,

Few more small things noted.
Please check once.

On 9/9/19 2:56 PM, Ritesh Harjani wrote:
> 
> 
> On 9/9/19 4:49 AM, Matthew Bobrowski wrote:
>> This patch introduces a new direct IO write code path implementation
>> that makes use of the iomap infrastructure.
>>
>> All direct IO write operations are now passed from the ->write_iter()
>> callback to the new function ext4_dio_write_iter(). This function is
>> responsible for calling into iomap infrastructure via
>> iomap_dio_rw(). Snippets of the direct IO code from within
>> ext4_file_write_iter(), such as checking whether the IO request is
>> unaligned asynchronous IO, or whether it will ber overwriting
>> allocated and initialized blocks has been moved out and into
>> ext4_dio_write_iter().
>>
>> The block mapping flags that are passed to ext4_map_blocks() from
>> within ext4_dio_get_block() and friends have effectively been taken
>> out and introduced within the ext4_iomap_begin(). If ext4_map_blocks()
>> happens to have instantiated blocks beyond the i_size, then we attempt
>> to place the inode onto the orphan list. Despite being able to perform
>> i_size extension checking earlier on in the direct IO code path, it
>> makes most sense to perform this bit post successful block allocation.
>>
>> The ->end_io() callback ext4_dio_write_end_io() is responsible for
>> removing the inode from the orphan list and determining if we should
>> truncate a failed write in the case of an error. We also convert a
>> range of unwritten extents to written if IOMAP_DIO_UNWRITTEN is set
>> and perform the necessary i_size/i_disksize extension if the
>> iocb->ki_pos + dio->size > i_size_read(inode).
>>
>> In the instance of a short write, we fallback to buffered IO and
>> complete whatever is left the 'iter'. Any blocks that may have been
>> allocated in preparation for direct IO will be reused by buffered IO,
>> so there's no issue with leaving allocated blocks beyond EOF.
>>
>> Signed-off-by: Matthew Bobrowski <mbobrowski@mbobrowski.org>
> 
> Sorry some minor simplification comments. Forgot to respond in previous 
> email.
> 
> Otherwise looks good.
> 
> Reviewed-by: Ritesh Harjani <riteshh@linux.ibm.com>
> 
> 
> 
>> ---
>>   fs/ext4/file.c  | 219 +++++++++++++++++++++++++++++++++---------------
>>   fs/ext4/inode.c |  57 ++++++++++---
>>   2 files changed, 198 insertions(+), 78 deletions(-)
>>
>> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
>> index 8e586198f6e6..bf22425a6a6f 100644
>> --- a/fs/ext4/file.c
>> +++ b/fs/ext4/file.c
>> @@ -29,6 +29,7 @@
>>   #include <linux/pagevec.h>
>>   #include <linux/uio.h>
>>   #include <linux/mman.h>
>> +#include <linux/backing-dev.h>
>>   #include "ext4.h"
>>   #include "ext4_jbd2.h"
>>   #include "xattr.h"
>> @@ -217,6 +218,14 @@ static ssize_t ext4_write_checks(struct kiocb 
>> *iocb, struct iov_iter *from)
>>       if (ret <= 0)
>>           return ret;
>>
>> +    ret = file_remove_privs(iocb->ki_filp);
>> +    if (ret)
>> +        return 0;
>> +
>> +    ret = file_update_time(iocb->ki_filp);
>> +    if (ret)
>> +        return 0;
>> +
>>       if (unlikely(IS_IMMUTABLE(inode)))
>>           return -EPERM;

Maybe we can move this up. If file is IMMUTABLE no point in
calling for above actions (file_remove_privs/file_updatetime).

also why not use file_modified() API which does the same.

>>
>> @@ -234,6 +243,34 @@ static ssize_t ext4_write_checks(struct kiocb 
>> *iocb, struct iov_iter *from)
>>       return iov_iter_count(from);
>>   }
>>
>> +static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
>> +                    struct iov_iter *from)
>> +{
>> +    ssize_t ret;
>> +    struct inode *inode = file_inode(iocb->ki_filp);
>> +
>> +    if (iocb->ki_flags & IOCB_NOWAIT)
>> +        return -EOPNOTSUPP;
>> +
>> +    if (!inode_trylock(inode))
>> +        inode_lock(inode);

Is it really needed to check for trylock first?
we can directly call for inode_lock() here.


>> +
>> +    ret = ext4_write_checks(iocb, from);
>> +    if (ret <= 0)
>> +        goto out;
>> +
>> +    current->backing_dev_info = inode_to_bdi(inode);
>> +    ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
>> +    current->backing_dev_info = NULL;
>> +out:
>> +    inode_unlock(inode);
>> +    if (likely(ret > 0)) {
>> +        iocb->ki_pos += ret;
>> +        ret = generic_write_sync(iocb, ret);
>> +    }
>> +    return ret;
>> +}
>> +
>>   static int ext4_handle_inode_extension(struct inode *inode, loff_t 
>> offset,
>>                          ssize_t len, size_t count)
>>   {
>> @@ -311,6 +348,118 @@ static int 
>> ext4_handle_failed_inode_extension(struct inode *inode, loff_t size)
>>       return ret;
>>   }
>>
>> +/*
>> + * For a write that extends the inode size, ext4_dio_write_iter() will
>> + * wait for the write to complete. Consequently, operations performed
>> + * within this function are still covered by the inode_lock().
>> + */
> Maybe add a comment that on success this returns 0.
> 
>> +static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, 
>> int error,
>> +                 unsigned int flags)
>> +{
>> +    int ret = 0;
> No need to initialize ret.
> 
> 
>> +    loff_t offset = iocb->ki_pos;
>> +    struct inode *inode = file_inode(iocb->ki_filp);
>> +
>> +    if (error) {
>> +        ret = ext4_handle_failed_inode_extension(inode, offset + size);
>> +        return ret ? ret : error;
>> +    }
>> +
>> +    if (flags & IOMAP_DIO_UNWRITTEN) {
>> +        ret = ext4_convert_unwritten_extents(NULL, inode,
>> +                             offset, size);
>> +        if (ret)
>> +            return ret;
>> +    }
>> +
>> +    if (offset + size > i_size_read(inode)) {
>> +        ret = ext4_handle_inode_extension(inode, offset, size, 0);
>> +        if (ret)
>> +            return ret;
>> +    }
>> +    return ret;
> Directly return 0, since if it falls here it mans it is a success case.
> You are anyway returning error from above error paths.
> 
>> +}
>> +
>> +static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct 
>> iov_iter *from)
>> +{
>> +    ssize_t ret;
>> +    loff_t offset = iocb->ki_pos;
>> +    size_t count = iov_iter_count(from);
>> +    struct inode *inode = file_inode(iocb->ki_filp);
>> +    bool extend = false, overwrite = false, unaligned_aio = false;
>> +
>> +    if (!inode_trylock(inode)) {
>> +        if (iocb->ki_flags & IOCB_NOWAIT)
>> +            return -EAGAIN;
>> +        inode_lock(inode);
>> +    }
>> +
>> +    if (!ext4_dio_checks(inode)) {
>> +        inode_unlock(inode);
>> +        /*
>> +         * Fallback to buffered IO if the operation on the
>> +         * inode is not supported by direct IO.
>> +         */
>> +        return ext4_buffered_write_iter(iocb, from);
>> +    }
>> +
>> +    ret = ext4_write_checks(iocb, from);
This can modify the count in iov_iter *from.

>> +    if (ret <= 0) {
>> +        inode_unlock(inode);
>> +        return ret;
>> +    }
let's recalculate count = iov_iter_count(from);

>> +
>> +    /*
>> +     * Unaligned direct AIO must be serialized among each other as
>> +     * the zeroing of partial blocks of two competing unaligned
>> +     * AIOs can result in data corruption.
>> +     */
>> +    if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
>> +        !is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, 
>> offset)) {
>> +        unaligned_aio = true;
>> +        inode_dio_wait(inode);
>> +    }
>> +
>> +    /*
>> +     * Determine whether the IO operation will overwrite allocated
>> +     * and initialized blocks. If so, check to see whether it is
>> +     * possible to take the dioread_nolock path.
>> +     */
>> +    if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) &&count here could be the old one.


>> +        ext4_should_dioread_nolock(inode)) {
>> +        overwrite = true;
>> +        downgrade_write(&inode->i_rwsem);
>> +    }
>> +
>> +    if (offset + count > i_size_read(inode) ||
>> +        offset + count > EXT4_I(inode)->i_disksize) {
ditto.

>> +        ext4_update_i_disksize(inode, inode->i_size);
>> +        extend = true;
>> +    }
>> +
>> +    ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, 
>> ext4_dio_write_end_io);
>> +
>> +    /*
>> +     * Unaligned direct AIO must be the only IO in flight or else
>> +     * any overlapping aligned IO after unaligned IO might result
>> +     * in data corruption. We also need to wait here in the case
>> +     * where the inode is being extended so that inode extension
>> +     * routines in ext4_dio_write_end_io() are covered by the
>> +     * inode_lock().
>> +     */
>> +    if (ret == -EIOCBQUEUED && (unaligned_aio || extend))
>> +        inode_dio_wait(inode);
>> +
>> +    if (overwrite)
>> +        inode_unlock_shared(inode);
>> +    else
>> +        inode_unlock(inode);
>> +
>> +    if (ret >= 0 && iov_iter_count(from))
>> +        return ext4_buffered_write_iter(iocb, from);
>> +    return ret;
>> +}
>> +
>>   #ifdef CONFIG_FS_DAX
>>   static ssize_t
>>   ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
>> @@ -325,15 +474,10 @@ ext4_dax_write_iter(struct kiocb *iocb, struct 
>> iov_iter *from)
>>               return -EAGAIN;
>>           inode_lock(inode);
>>       }
>> +
>>       ret = ext4_write_checks(iocb, from);
>>       if (ret <= 0)
>>           goto out;
>> -    ret = file_remove_privs(iocb->ki_filp);
>> -    if (ret)
>> -        goto out;
>> -    ret = file_update_time(iocb->ki_filp);
>> -    if (ret)
>> -        goto out;
>>
>>       offset = iocb->ki_pos;
>>       ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
>> @@ -359,73 +503,16 @@ static ssize_t
>>   ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>>   {
>>       struct inode *inode = file_inode(iocb->ki_filp);
>> -    int o_direct = iocb->ki_flags & IOCB_DIRECT;
>> -    int unaligned_aio = 0;
>> -    int overwrite = 0;
>> -    ssize_t ret;
>>
>>       if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
>>           return -EIO;
>>
>> -#ifdef CONFIG_FS_DAX
>>       if (IS_DAX(inode))
>>           return ext4_dax_write_iter(iocb, from);
>> -#endif
>> -    if (!o_direct && (iocb->ki_flags & IOCB_NOWAIT))
>> -        return -EOPNOTSUPP;
>>
>> -    if (!inode_trylock(inode)) {
>> -        if (iocb->ki_flags & IOCB_NOWAIT)
>> -            return -EAGAIN;
>> -        inode_lock(inode);
>> -    }
>> -
>> -    ret = ext4_write_checks(iocb, from);
>> -    if (ret <= 0)
>> -        goto out;
>> -
>> -    /*
>> -     * Unaligned direct AIO must be serialized among each other as 
>> zeroing
>> -     * of partial blocks of two competing unaligned AIOs can result 
>> in data
>> -     * corruption.
>> -     */
>> -    if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
>> -        !is_sync_kiocb(iocb) &&
>> -        ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
>> -        unaligned_aio = 1;
>> -        ext4_unwritten_wait(inode);
>> -    }
>> -
>> -    iocb->private = &overwrite;
>> -    /* Check whether we do a DIO overwrite or not */
>> -    if (o_direct && !unaligned_aio) {
>> -        if (ext4_overwrite_io(inode, iocb->ki_pos, 
>> iov_iter_count(from))) {
>> -            if (ext4_should_dioread_nolock(inode))
>> -                overwrite = 1;
>> -        } else if (iocb->ki_flags & IOCB_NOWAIT) {
>> -            ret = -EAGAIN;
>> -            goto out;
>> -        }
>> -    }
>> -
>> -    ret = __generic_file_write_iter(iocb, from);
>> -    /*
>> -     * Unaligned direct AIO must be the only IO in flight. Otherwise
>> -     * overlapping aligned IO after unaligned might result in data
>> -     * corruption.
>> -     */
>> -    if (ret == -EIOCBQUEUED && unaligned_aio)
>> -        ext4_unwritten_wait(inode);
>> -    inode_unlock(inode);
>> -
>> -    if (ret > 0)
>> -        ret = generic_write_sync(iocb, ret);
>> -
>> -    return ret;
>> -
>> -out:
>> -    inode_unlock(inode);
>> -    return ret;
>> +    if (iocb->ki_flags & IOCB_DIRECT)
>> +        return ext4_dio_write_iter(iocb, from);
>> +    return ext4_buffered_write_iter(iocb, from);
>>   }
>>
>>   #ifdef CONFIG_FS_DAX
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index efb184928e51..f52ad3065236 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -3513,11 +3513,13 @@ static int ext4_iomap_begin(struct inode 
>> *inode, loff_t offset, loff_t length,
>>               }
>>           }
>>       } else if (flags & IOMAP_WRITE) {
>> -        int dio_credits;
>>           handle_t *handle;
>> -        int retries = 0;
>> +        int dio_credits, retries = 0, m_flags = 0;
>>
>> -        /* Trim mapping request to maximum we can map at once for DIO */
>> +        /*
>> +         * Trim mapping request to maximum we can map at once
>> +         * for DIO.
>> +         */
>>           if (map.m_len > DIO_MAX_BLOCKS)
>>               map.m_len = DIO_MAX_BLOCKS;
>>           dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
>> @@ -3533,8 +3535,30 @@ static int ext4_iomap_begin(struct inode 
>> *inode, loff_t offset, loff_t length,
>>           if (IS_ERR(handle))
>>               return PTR_ERR(handle);
>>
>> -        ret = ext4_map_blocks(handle, inode, &map,
>> -                      EXT4_GET_BLOCKS_CREATE_ZERO);
>> +        /*
>> +         * DAX and direct IO are the only two operations that
>> +         * are currently supported with IOMAP_WRITE.
>> +         */
>> +        WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT));
>> +        if (IS_DAX(inode))
>> +            m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
>> +        else if (round_down(offset, i_blocksize(inode)) >=
>> +             i_size_read(inode))
>> +            m_flags = EXT4_GET_BLOCKS_CREATE;
>> +        else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
>> +            m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
>> +
>> +        ret = ext4_map_blocks(handle, inode, &map, m_flags);
>> +
>> +        /*
>> +         * We cannot fill holes in indirect tree based inodes
>> +         * as that could expose stale data in the case of a
>> +         * crash. Use the magic error code to fallback to
>> +         * buffered IO.
>> +         */
> 
> I like this comment ;)
> Help others to understand what is really going on here.
> 
>> +        if (!m_flags && !ret)
>> +            ret = -ENOTBLK;
>> +
>>           if (ret < 0) {
>>               ext4_journal_stop(handle);
>>               if (ret == -ENOSPC &&
>> @@ -3544,13 +3568,14 @@ static int ext4_iomap_begin(struct inode 
>> *inode, loff_t offset, loff_t length,
>>           }
>>
>>           /*
>> -         * If we added blocks beyond i_size, we need to make sure they
>> -         * will get truncated if we crash before updating i_size in
>> -         * ext4_iomap_end(). For faults we don't need to do that (and
>> -         * even cannot because for orphan list operations inode_lock is
>> -         * required) - if we happen to instantiate block beyond i_size,
>> -         * it is because we race with truncate which has already added
>> -         * the inode to the orphan list.
>> +         * If we added blocks beyond i_size, we need to make
>> +         * sure they will get truncated if we crash before
>> +         * updating the i_size. For faults we don't need to do
>> +         * that (and even cannot because for orphan list
>> +         * operations inode_lock is required) - if we happen
>> +         * to instantiate block beyond i_size, it is because
>> +         * we race with truncate which has already added the
>> +         * inode to the orphan list.
>>            */
>>           if (!(flags & IOMAP_FAULT) && first_block + map.m_len >
>>               (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) {
>> @@ -3612,6 +3637,14 @@ static int ext4_iomap_begin(struct inode 
>> *inode, loff_t offset, loff_t length,
>>   static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t 
>> length,
>>                 ssize_t written, unsigned flags, struct iomap *iomap)
>>   {
>> +    /*
>> +     * Check to see whether an error occurred while writing data
>> +     * out to allocated blocks. If so, return the magic error code
>> +     * so that we fallback to buffered IO and reuse the blocks
>> +     * that were allocated in preparation for the direct IO write.
>> +     */
>> +    if (flags & IOMAP_DIRECT && written == 0)
>> +        return -ENOTBLK;
>>       return 0;
>>   }
>>
>
Matthew Bobrowski Sept. 11, 2019, 12:39 p.m. UTC | #7
On Wed, Sep 11, 2019 at 01:38:52PM +0530, Ritesh Harjani wrote:
> On 9/9/19 2:56 PM, Ritesh Harjani wrote:
> > On 9/9/19 4:49 AM, Matthew Bobrowski wrote:
> > > @@ -217,6 +218,14 @@ static ssize_t ext4_write_checks(struct kiocb
> > > *iocb, struct iov_iter *from)
> > >       if (ret <= 0)
> > >           return ret;
> > > 
> > > +    ret = file_remove_privs(iocb->ki_filp);
> > > +    if (ret)
> > > +        return 0;
> > > +
> > > +    ret = file_update_time(iocb->ki_filp);
> > > +    if (ret)
> > > +        return 0;
> > > +
> > >       if (unlikely(IS_IMMUTABLE(inode)))
> > >           return -EPERM;
> 
> Maybe we can move this up. If file is IMMUTABLE no point in
> calling for above actions (file_remove_privs/file_updatetime).

Yep, sure could do this. In fact, I think we could put this above
generic_write_checks().

> Also why not use file_modified() API which does the same.

Ah, nice. Indeed we can, thanks for simplifying it.

> > > @@ -234,6 +243,34 @@ static ssize_t ext4_write_checks(struct kiocb
> > > *iocb, struct iov_iter *from)
> > >       return iov_iter_count(from);
> > >   }
> > > 
> > > +static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
> > > +                    struct iov_iter *from)
> > > +{
> > > +    ssize_t ret;
> > > +    struct inode *inode = file_inode(iocb->ki_filp);
> > > +
> > > +    if (iocb->ki_flags & IOCB_NOWAIT)
> > > +        return -EOPNOTSUPP;
> > > +
> > > +    if (!inode_trylock(inode))
> > > +        inode_lock(inode);
> 
> Is it really needed to check for trylock first?
> we can directly call for inode_lock() here.

You're right, no need to do this dance. We can call inode_lock() directly.

> > > +
> > > +    ret = ext4_write_checks(iocb, from);
> > > +    if (ret <= 0)
> > > +        goto out;
> > > +
> > > +    current->backing_dev_info = inode_to_bdi(inode);
> > > +    ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
> > > +    current->backing_dev_info = NULL;
> > > +out:
> > > +    inode_unlock(inode);
> > > +    if (likely(ret > 0)) {
> > > +        iocb->ki_pos += ret;
> > > +        ret = generic_write_sync(iocb, ret);
> > > +    }
> > > +    return ret;
> > > +}
> > > +
> > > +    if (!ext4_dio_checks(inode)) {
> > > +        inode_unlock(inode);
> > > +        /*
> > > +         * Fallback to buffered IO if the operation on the
> > > +         * inode is not supported by direct IO.
> > > +         */
> > > +        return ext4_buffered_write_iter(iocb, from);
> > > +    }
> > > +
> > > +    ret = ext4_write_checks(iocb, from);
> This can modify the count in iov_iter *from.

Good point. We'll recalculate the iter 'count' again.

Thank you for the review/suggestions, highly appreciated.

--<M>--

Patch
diff mbox series

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 8e586198f6e6..bf22425a6a6f 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -29,6 +29,7 @@ 
 #include <linux/pagevec.h>
 #include <linux/uio.h>
 #include <linux/mman.h>
+#include <linux/backing-dev.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -217,6 +218,14 @@  static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
 	if (ret <= 0)
 		return ret;
 
+	ret = file_remove_privs(iocb->ki_filp);
+	if (ret)
+		return 0;
+
+	ret = file_update_time(iocb->ki_filp);
+	if (ret)
+		return 0;
+
 	if (unlikely(IS_IMMUTABLE(inode)))
 		return -EPERM;
 
@@ -234,6 +243,34 @@  static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
 	return iov_iter_count(from);
 }
 
+static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
+					struct iov_iter *from)
+{
+	ssize_t ret;
+	struct inode *inode = file_inode(iocb->ki_filp);
+
+	if (iocb->ki_flags & IOCB_NOWAIT)
+		return -EOPNOTSUPP;
+
+	if (!inode_trylock(inode))
+		inode_lock(inode);
+
+	ret = ext4_write_checks(iocb, from);
+	if (ret <= 0)
+		goto out;
+
+	current->backing_dev_info = inode_to_bdi(inode);
+	ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
+	current->backing_dev_info = NULL;
+out:
+	inode_unlock(inode);
+	if (likely(ret > 0)) {
+		iocb->ki_pos += ret;
+		ret = generic_write_sync(iocb, ret);
+	}
+	return ret;
+}
+
 static int ext4_handle_inode_extension(struct inode *inode, loff_t offset,
 				       ssize_t len, size_t count)
 {
@@ -311,6 +348,118 @@  static int ext4_handle_failed_inode_extension(struct inode *inode, loff_t size)
 	return ret;
 }
 
+/*
+ * For a write that extends the inode size, ext4_dio_write_iter() will
+ * wait for the write to complete. Consequently, operations performed
+ * within this function are still covered by the inode_lock().
+ */
+static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error,
+				 unsigned int flags)
+{
+	int ret = 0;
+	loff_t offset = iocb->ki_pos;
+	struct inode *inode = file_inode(iocb->ki_filp);
+
+	if (error) {
+		ret = ext4_handle_failed_inode_extension(inode, offset + size);
+		return ret ? ret : error;
+	}
+
+	if (flags & IOMAP_DIO_UNWRITTEN) {
+		ret = ext4_convert_unwritten_extents(NULL, inode,
+						     offset, size);
+		if (ret)
+			return ret;
+	}
+
+	if (offset + size > i_size_read(inode)) {
+		ret = ext4_handle_inode_extension(inode, offset, size, 0);
+		if (ret)
+			return ret;
+	}
+	return ret;
+}
+
+static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	ssize_t ret;
+	loff_t offset = iocb->ki_pos;
+	size_t count = iov_iter_count(from);
+	struct inode *inode = file_inode(iocb->ki_filp);
+	bool extend = false, overwrite = false, unaligned_aio = false;
+
+	if (!inode_trylock(inode)) {
+		if (iocb->ki_flags & IOCB_NOWAIT)
+			return -EAGAIN;
+		inode_lock(inode);
+	}
+
+	if (!ext4_dio_checks(inode)) {
+		inode_unlock(inode);
+		/*
+		 * Fallback to buffered IO if the operation on the
+		 * inode is not supported by direct IO.
+		 */
+		return ext4_buffered_write_iter(iocb, from);
+	}
+
+	ret = ext4_write_checks(iocb, from);
+	if (ret <= 0) {
+		inode_unlock(inode);
+		return ret;
+	}
+
+	/*
+	 * Unaligned direct AIO must be serialized among each other as
+	 * the zeroing of partial blocks of two competing unaligned
+	 * AIOs can result in data corruption.
+	 */
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
+	    !is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) {
+		unaligned_aio = true;
+		inode_dio_wait(inode);
+	}
+
+	/*
+	 * Determine whether the IO operation will overwrite allocated
+	 * and initialized blocks. If so, check to see whether it is
+	 * possible to take the dioread_nolock path.
+	 */
+	if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) &&
+	    ext4_should_dioread_nolock(inode)) {
+		overwrite = true;
+		downgrade_write(&inode->i_rwsem);
+	}
+
+	if (offset + count > i_size_read(inode) ||
+	    offset + count > EXT4_I(inode)->i_disksize) {
+		ext4_update_i_disksize(inode, inode->i_size);
+		extend = true;
+	}
+
+	ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, ext4_dio_write_end_io);
+
+	/*
+	 * Unaligned direct AIO must be the only IO in flight or else
+	 * any overlapping aligned IO after unaligned IO might result
+	 * in data corruption. We also need to wait here in the case
+	 * where the inode is being extended so that inode extension
+	 * routines in ext4_dio_write_end_io() are covered by the
+	 * inode_lock().
+	 */
+	if (ret == -EIOCBQUEUED && (unaligned_aio || extend))
+		inode_dio_wait(inode);
+
+	if (overwrite)
+		inode_unlock_shared(inode);
+	else
+		inode_unlock(inode);
+
+	if (ret >= 0 && iov_iter_count(from))
+		return ext4_buffered_write_iter(iocb, from);
+	return ret;
+}
+
 #ifdef CONFIG_FS_DAX
 static ssize_t
 ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
@@ -325,15 +474,10 @@  ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
 			return -EAGAIN;
 		inode_lock(inode);
 	}
+
 	ret = ext4_write_checks(iocb, from);
 	if (ret <= 0)
 		goto out;
-	ret = file_remove_privs(iocb->ki_filp);
-	if (ret)
-		goto out;
-	ret = file_update_time(iocb->ki_filp);
-	if (ret)
-		goto out;
 
 	offset = iocb->ki_pos;
 	ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
@@ -359,73 +503,16 @@  static ssize_t
 ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
-	int o_direct = iocb->ki_flags & IOCB_DIRECT;
-	int unaligned_aio = 0;
-	int overwrite = 0;
-	ssize_t ret;
 
 	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
 		return -EIO;
 
-#ifdef CONFIG_FS_DAX
 	if (IS_DAX(inode))
 		return ext4_dax_write_iter(iocb, from);
-#endif
-	if (!o_direct && (iocb->ki_flags & IOCB_NOWAIT))
-		return -EOPNOTSUPP;
 
-	if (!inode_trylock(inode)) {
-		if (iocb->ki_flags & IOCB_NOWAIT)
-			return -EAGAIN;
-		inode_lock(inode);
-	}
-
-	ret = ext4_write_checks(iocb, from);
-	if (ret <= 0)
-		goto out;
-
-	/*
-	 * Unaligned direct AIO must be serialized among each other as zeroing
-	 * of partial blocks of two competing unaligned AIOs can result in data
-	 * corruption.
-	 */
-	if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
-	    !is_sync_kiocb(iocb) &&
-	    ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
-		unaligned_aio = 1;
-		ext4_unwritten_wait(inode);
-	}
-
-	iocb->private = &overwrite;
-	/* Check whether we do a DIO overwrite or not */
-	if (o_direct && !unaligned_aio) {
-		if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
-			if (ext4_should_dioread_nolock(inode))
-				overwrite = 1;
-		} else if (iocb->ki_flags & IOCB_NOWAIT) {
-			ret = -EAGAIN;
-			goto out;
-		}
-	}
-
-	ret = __generic_file_write_iter(iocb, from);
-	/*
-	 * Unaligned direct AIO must be the only IO in flight. Otherwise
-	 * overlapping aligned IO after unaligned might result in data
-	 * corruption.
-	 */
-	if (ret == -EIOCBQUEUED && unaligned_aio)
-		ext4_unwritten_wait(inode);
-	inode_unlock(inode);
-
-	if (ret > 0)
-		ret = generic_write_sync(iocb, ret);
-
-	return ret;
-
-out:
-	inode_unlock(inode);
-	return ret;
+	if (iocb->ki_flags & IOCB_DIRECT)
+		return ext4_dio_write_iter(iocb, from);
+	return ext4_buffered_write_iter(iocb, from);
 }
 
 #ifdef CONFIG_FS_DAX
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index efb184928e51..f52ad3065236 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3513,11 +3513,13 @@  static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 			}
 		}
 	} else if (flags & IOMAP_WRITE) {
-		int dio_credits;
 		handle_t *handle;
-		int retries = 0;
+		int dio_credits, retries = 0, m_flags = 0;
 
-		/* Trim mapping request to maximum we can map at once for DIO */
+		/*
+		 * Trim mapping request to maximum we can map at once
+		 * for DIO.
+		 */
 		if (map.m_len > DIO_MAX_BLOCKS)
 			map.m_len = DIO_MAX_BLOCKS;
 		dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
@@ -3533,8 +3535,30 @@  static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 		if (IS_ERR(handle))
 			return PTR_ERR(handle);
 
-		ret = ext4_map_blocks(handle, inode, &map,
-				      EXT4_GET_BLOCKS_CREATE_ZERO);
+		/*
+		 * DAX and direct IO are the only two operations that
+		 * are currently supported with IOMAP_WRITE.
+		 */
+		WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT));
+		if (IS_DAX(inode))
+			m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
+		else if (round_down(offset, i_blocksize(inode)) >=
+			 i_size_read(inode))
+			m_flags = EXT4_GET_BLOCKS_CREATE;
+		else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+			m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
+
+		ret = ext4_map_blocks(handle, inode, &map, m_flags);
+
+		/*
+		 * We cannot fill holes in indirect tree based inodes
+		 * as that could expose stale data in the case of a
+		 * crash. Use the magic error code to fallback to
+		 * buffered IO.
+		 */
+		if (!m_flags && !ret)
+			ret = -ENOTBLK;
+
 		if (ret < 0) {
 			ext4_journal_stop(handle);
 			if (ret == -ENOSPC &&
@@ -3544,13 +3568,14 @@  static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 		}
 
 		/*
-		 * If we added blocks beyond i_size, we need to make sure they
-		 * will get truncated if we crash before updating i_size in
-		 * ext4_iomap_end(). For faults we don't need to do that (and
-		 * even cannot because for orphan list operations inode_lock is
-		 * required) - if we happen to instantiate block beyond i_size,
-		 * it is because we race with truncate which has already added
-		 * the inode to the orphan list.
+		 * If we added blocks beyond i_size, we need to make
+		 * sure they will get truncated if we crash before
+		 * updating the i_size. For faults we don't need to do
+		 * that (and even cannot because for orphan list
+		 * operations inode_lock is required) - if we happen
+		 * to instantiate block beyond i_size, it is because
+		 * we race with truncate which has already added the
+		 * inode to the orphan list.
 		 */
 		if (!(flags & IOMAP_FAULT) && first_block + map.m_len >
 		    (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) {
@@ -3612,6 +3637,14 @@  static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
 			  ssize_t written, unsigned flags, struct iomap *iomap)
 {
+	/*
+	 * Check to see whether an error occurred while writing data
+	 * out to allocated blocks. If so, return the magic error code
+	 * so that we fallback to buffered IO and reuse the blocks
+	 * that were allocated in preparation for the direct IO write.
+	 */
+	if (flags & IOMAP_DIRECT && written == 0)
+		return -ENOTBLK;
 	return 0;
 }