diff mbox series

[v2,2/6] ext4: move inode extension/truncate code out from ext4_iomap_end()

Message ID c1e9b23ced988587dfec399021a5b62983745842.1567978633.git.mbobrowski@mbobrowski.org
State Superseded
Headers show
Series ext4: port direct IO to iomap infrastructure | expand

Commit Message

Matthew Bobrowski Sept. 8, 2019, 11:19 p.m. UTC
In preparation for implementing the iomap direct IO write path
modifications, the inode extension/truncate code needs to be moved out
from ext4_iomap_end(). For direct IO, if the current code remained
within ext4_iomap_end() it would behave incorrectly. If we update the
inode size prior to converting unwritten extents we run the risk of
allowing a racing direct IO read operation to find unwritten extents
before they are converted.

The inode extension/truncate code has been moved out into a new helper
ext4_handle_inode_extension(). This helper has been designed so that
it can be used by both DAX and direct IO paths in the instance that
the result of the write is extending the inode.

Signed-off-by: Matthew Bobrowski <mbobrowski@mbobrowski.org>
---
 fs/ext4/file.c  | 93 ++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/ext4/inode.c | 48 +------------------------
 2 files changed, 93 insertions(+), 48 deletions(-)

Comments

Ritesh Harjani Sept. 9, 2019, 8:17 a.m. UTC | #1
On 9/9/19 4:49 AM, Matthew Bobrowski wrote:
> In preparation for implementing the iomap direct IO write path
> modifications, the inode extension/truncate code needs to be moved out
> from ext4_iomap_end(). For direct IO, if the current code remained
> within ext4_iomap_end() it would behave incorrectly. If we update the
> inode size prior to converting unwritten extents we run the risk of
> allowing a racing direct IO read operation to find unwritten extents
> before they are converted.
> 
> The inode extension/truncate code has been moved out into a new helper
> ext4_handle_inode_extension(). This helper has been designed so that
> it can be used by both DAX and direct IO paths in the instance that
> the result of the write is extending the inode.
> 
> Signed-off-by: Matthew Bobrowski <mbobrowski@mbobrowski.org>
> ---
>   fs/ext4/file.c  | 93 ++++++++++++++++++++++++++++++++++++++++++++++++-
>   fs/ext4/inode.c | 48 +------------------------
>   2 files changed, 93 insertions(+), 48 deletions(-)
> 
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index e52e3928dc25..8e586198f6e6 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -33,6 +33,7 @@
>   #include "ext4_jbd2.h"
>   #include "xattr.h"
>   #include "acl.h"
> +#include "truncate.h"
> 
>   static bool ext4_dio_checks(struct inode *inode)
>   {
> @@ -233,12 +234,91 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
>   	return iov_iter_count(from);
>   }
> 
> +static int ext4_handle_inode_extension(struct inode *inode, loff_t offset,
> +				       ssize_t len, size_t count)
> +{
> +	handle_t *handle;
> +	bool truncate = false;
> +	ext4_lblk_t written_blk, end_blk;
> +	int ret = 0, blkbits = inode->i_blkbits;
> +
> +	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
> +	if (IS_ERR(handle)) {
> +		ret = PTR_ERR(handle);
> +		goto orphan_del;
> +	}
> +
> +	if (ext4_update_inode_size(inode, offset + len))
> +		ext4_mark_inode_dirty(handle, inode);
> +
> +	/*
> +	 * We may need truncate allocated but not written blocks
> +	 * beyond EOF.
> +	 */
> +	written_blk = ALIGN(offset + len, 1 << blkbits);
> +	end_blk = ALIGN(offset + len + count, 1 << blkbits);

why add len in end_blk calculation?
shouldn't this be like below?
	end_blk = ALIGN(offset + count, 1 << blkbits);

> +	if (written_blk < end_blk && ext4_can_truncate(inode))
> +		truncate = true;
> +
> +	/*
> +	 * Remove the inode from the orphan list if it has been
> +	 * extended and everything went OK.
> +	 */
> +	if (!truncate && inode->i_nlink)
> +		ext4_orphan_del(handle, inode);
> +	ext4_journal_stop(handle);
> +
> +	if (truncate) {
> +		ext4_truncate_failed_write(inode);
> +orphan_del:
> +		/*
> +		 * If the truncate operation failed early the inode
> +		 * may still be on the orphan list. In that case, we
> +		 * need try remove the inode from the linked list in
> +		 * memory.
> +		 */
> +		if (inode->i_nlink)
> +			ext4_orphan_del(NULL, inode);
> +	}
> +	return ret;
> +}
> +
> +/*
> + * The inode may have been placed onto the orphan list or has had
> + * blocks allocated beyond EOF as a result of an extension. We need to
> + * ensure that any necessary cleanup routines are performed if the
> + * error path has been taken for a write.
> + */
> +static int ext4_handle_failed_inode_extension(struct inode *inode, loff_t size)
> +{
> +	int ret = 0;

No need of ret anyways.


> +	handle_t *handle;
> +
> +	if (size > i_size_read(inode))
> +		ext4_truncate_failed_write(inode);
> +
> +	if (!list_empty(&EXT4_I(inode)->i_orphan)) {
> +		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
> +		if (IS_ERR(handle)) {
> +			if (inode->i_nlink)
> +				ext4_orphan_del(NULL, inode);
> +			return PTR_ERR(handle);
> +		}
> +		if (inode->i_nlink)
> +			ext4_orphan_del(handle, inode);
> +		ext4_journal_stop(handle);
> +	}
> +	return ret;

can directly call for `return 0;`

> +}
> +
>   #ifdef CONFIG_FS_DAX
>   static ssize_t
>   ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
>   {
> -	struct inode *inode = file_inode(iocb->ki_filp);
>   	ssize_t ret;
> +	int error = 0;
> +	loff_t offset;
> +	struct inode *inode = file_inode(iocb->ki_filp);
> 
>   	if (!inode_trylock(inode)) {
>   		if (iocb->ki_flags & IOCB_NOWAIT)
> @@ -255,7 +335,18 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
>   	if (ret)
>   		goto out;
> 
> +	offset = iocb->ki_pos;
>   	ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
> +	if (ret > 0 && iocb->ki_pos > i_size_read(inode))
> +		error = ext4_handle_inode_extension(inode, offset, ret,
> +						    iov_iter_count(from));
> +
> +	if (ret < 0)
> +		error = ext4_handle_failed_inode_extension(inode,
> +							   iocb->ki_pos);
> +
> +	if (error)
> +		ret = error;
>   out:
>   	inode_unlock(inode);
>   	if (ret > 0)
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 420fe3deed39..761ce6286b05 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -3601,53 +3601,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
>   static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
>   			  ssize_t written, unsigned flags, struct iomap *iomap)
>   {
> -	int ret = 0;
> -	handle_t *handle;
> -	int blkbits = inode->i_blkbits;
> -	bool truncate = false;
> -
> -	if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
> -		return 0;
> -
> -	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
> -	if (IS_ERR(handle)) {
> -		ret = PTR_ERR(handle);
> -		goto orphan_del;
> -	}
> -	if (ext4_update_inode_size(inode, offset + written))
> -		ext4_mark_inode_dirty(handle, inode);
> -	/*
> -	 * We may need to truncate allocated but not written blocks beyond EOF.
> -	 */
> -	if (iomap->offset + iomap->length >
> -	    ALIGN(inode->i_size, 1 << blkbits)) {
> -		ext4_lblk_t written_blk, end_blk;
> -
> -		written_blk = (offset + written) >> blkbits;
> -		end_blk = (offset + length) >> blkbits;
> -		if (written_blk < end_blk && ext4_can_truncate(inode))
> -			truncate = true;
> -	}
> -	/*
> -	 * Remove inode from orphan list if we were extending a inode and
> -	 * everything went fine.
> -	 */
> -	if (!truncate && inode->i_nlink &&
> -	    !list_empty(&EXT4_I(inode)->i_orphan))
> -		ext4_orphan_del(handle, inode);
> -	ext4_journal_stop(handle);
> -	if (truncate) {
> -		ext4_truncate_failed_write(inode);
> -orphan_del:
> -		/*
> -		 * If truncate failed early the inode might still be on the
> -		 * orphan list; we need to make sure the inode is removed from
> -		 * the orphan list in that case.
> -		 */
> -		if (inode->i_nlink)
> -			ext4_orphan_del(NULL, inode);
> -	}
> -	return ret;
> +	return 0;
>   }
> 
>   const struct iomap_ops ext4_iomap_ops = {
>
Matthew Bobrowski Sept. 10, 2019, 10:26 a.m. UTC | #2
On Mon, Sep 09, 2019 at 01:47:28PM +0530, Ritesh Harjani wrote:
> On 9/9/19 4:49 AM, Matthew Bobrowski wrote:
> > +static int ext4_handle_inode_extension(struct inode *inode, loff_t offset,
> > +				       ssize_t len, size_t count)
> > +{
> > +	handle_t *handle;
> > +	bool truncate = false;
> > +	ext4_lblk_t written_blk, end_blk;
> > +	int ret = 0, blkbits = inode->i_blkbits;
> > +
> > +	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
> > +	if (IS_ERR(handle)) {
> > +		ret = PTR_ERR(handle);
> > +		goto orphan_del;
> > +	}
> > +
> > +	if (ext4_update_inode_size(inode, offset + len))
> > +		ext4_mark_inode_dirty(handle, inode);
> > +
> > +	/*
> > +	 * We may need truncate allocated but not written blocks
> > +	 * beyond EOF.
> > +	 */
> > +	written_blk = ALIGN(offset + len, 1 << blkbits);
> > +	end_blk = ALIGN(offset + len + count, 1 << blkbits);
> 
> why add len in end_blk calculation?
> shouldn't this be like below?
> 	end_blk = ALIGN(offset + count, 1 << blkbits);

I don't believe that would be entirely correct. The reason being is that the
'end_blk' is meant to represent the last logical block which we should expect
to have used for the write operation. So, we have the 'offset' which
represents starting point, 'len' which is the amount of data that has been
written, and 'count' which is the amount of data that we still have left over
in the 'iter', if any.

The count in the 'iter' is decremented as that data is copied from it. So if
we did use 'offset' + 'count', in the instance of a short write, we
potentially wouldn't truncate any of the allocated but not written blocks. I
guess this would hold true for the DAX code path at this point, seeing as
though for the DIO case we pass in '0'.

> > +/*
> > + * The inode may have been placed onto the orphan list or has had
> > + * blocks allocated beyond EOF as a result of an extension. We need to
> > + * ensure that any necessary cleanup routines are performed if the
> > + * error path has been taken for a write.
> > + */
> > +static int ext4_handle_failed_inode_extension(struct inode *inode, loff_t size)
> > +{
> > +	int ret = 0;
> 
> No need of ret anyways.
> 
> 
> > +	handle_t *handle;
> > +
> > +	if (size > i_size_read(inode))
> > +		ext4_truncate_failed_write(inode);
> > +
> > +	if (!list_empty(&EXT4_I(inode)->i_orphan)) {
> > +		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
> > +		if (IS_ERR(handle)) {
> > +			if (inode->i_nlink)
> > +				ext4_orphan_del(NULL, inode);
> > +			return PTR_ERR(handle);
> > +		}
> > +		if (inode->i_nlink)
> > +			ext4_orphan_del(handle, inode);
> > +		ext4_journal_stop(handle);
> > +	}
> > +	return ret;
> 
> can directly call for `return 0;`

True.

--<M>--
Ritesh Harjani Sept. 10, 2019, 11:16 a.m. UTC | #3
On 9/10/19 3:56 PM, Matthew Bobrowski wrote:
> On Mon, Sep 09, 2019 at 01:47:28PM +0530, Ritesh Harjani wrote:
>> On 9/9/19 4:49 AM, Matthew Bobrowski wrote:
>>> +static int ext4_handle_inode_extension(struct inode *inode, loff_t offset,
>>> +				       ssize_t len, size_t count)
>>> +{
>>> +	handle_t *handle;
>>> +	bool truncate = false;
>>> +	ext4_lblk_t written_blk, end_blk;
>>> +	int ret = 0, blkbits = inode->i_blkbits;
>>> +
>>> +	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
>>> +	if (IS_ERR(handle)) {
>>> +		ret = PTR_ERR(handle);
>>> +		goto orphan_del;
>>> +	}
>>> +
>>> +	if (ext4_update_inode_size(inode, offset + len))
>>> +		ext4_mark_inode_dirty(handle, inode);
>>> +
>>> +	/*
>>> +	 * We may need truncate allocated but not written blocks
>>> +	 * beyond EOF.
>>> +	 */
>>> +	written_blk = ALIGN(offset + len, 1 << blkbits);
>>> +	end_blk = ALIGN(offset + len + count, 1 << blkbits);
>>
>> why add len in end_blk calculation?
>> shouldn't this be like below?
>> 	end_blk = ALIGN(offset + count, 1 << blkbits);
> 
> I don't believe that would be entirely correct. The reason being is that the
> 'end_blk' is meant to represent the last logical block which we should expect
> to have used for the write operation. So, we have the 'offset' which
> represents starting point, 'len' which is the amount of data that has been
> written, and 'count' which is the amount of data that we still have left over
> in the 'iter', if any.
> 

Agree. Yes, I see that you are passing iov_iter_count(from) as a param,
after the dax write.


> The count in the 'iter' is decremented as that data is copied from it. So if > we did use 'offset' + 'count', in the instance of a short write, we
> potentially wouldn't truncate any of the allocated but not written blocks. I
> guess this would hold true for the DAX code path at this point, seeing as
> though for the DIO case we pass in '0'.

Agreed.


> 
>>> +/*
>>> + * The inode may have been placed onto the orphan list or has had
>>> + * blocks allocated beyond EOF as a result of an extension. We need to
>>> + * ensure that any necessary cleanup routines are performed if the
>>> + * error path has been taken for a write.
>>> + */
>>> +static int ext4_handle_failed_inode_extension(struct inode *inode, loff_t size)
>>> +{
>>> +	int ret = 0;
>>
>> No need of ret anyways.
>>
>>
>>> +	handle_t *handle;
>>> +
>>> +	if (size > i_size_read(inode))
>>> +		ext4_truncate_failed_write(inode);
>>> +
>>> +	if (!list_empty(&EXT4_I(inode)->i_orphan)) {
>>> +		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
>>> +		if (IS_ERR(handle)) {
>>> +			if (inode->i_nlink)
>>> +				ext4_orphan_del(NULL, inode);
>>> +			return PTR_ERR(handle);
>>> +		}
>>> +		if (inode->i_nlink)
>>> +			ext4_orphan_del(handle, inode);
>>> +		ext4_journal_stop(handle);
>>> +	}
>>> +	return ret;
>>
>> can directly call for `return 0;`
> 
> True.
> 
> --<M>--
>
diff mbox series

Patch

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index e52e3928dc25..8e586198f6e6 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -33,6 +33,7 @@ 
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
+#include "truncate.h"
 
 static bool ext4_dio_checks(struct inode *inode)
 {
@@ -233,12 +234,91 @@  static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
 	return iov_iter_count(from);
 }
 
+static int ext4_handle_inode_extension(struct inode *inode, loff_t offset,
+				       ssize_t len, size_t count)
+{
+	handle_t *handle;
+	bool truncate = false;
+	ext4_lblk_t written_blk, end_blk;
+	int ret = 0, blkbits = inode->i_blkbits;
+
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto orphan_del;
+	}
+
+	if (ext4_update_inode_size(inode, offset + len))
+		ext4_mark_inode_dirty(handle, inode);
+
+	/*
+	 * We may need truncate allocated but not written blocks
+	 * beyond EOF.
+	 */
+	written_blk = ALIGN(offset + len, 1 << blkbits);
+	end_blk = ALIGN(offset + len + count, 1 << blkbits);
+	if (written_blk < end_blk && ext4_can_truncate(inode))
+		truncate = true;
+
+	/*
+	 * Remove the inode from the orphan list if it has been
+	 * extended and everything went OK.
+	 */
+	if (!truncate && inode->i_nlink)
+		ext4_orphan_del(handle, inode);
+	ext4_journal_stop(handle);
+
+	if (truncate) {
+		ext4_truncate_failed_write(inode);
+orphan_del:
+		/*
+		 * If the truncate operation failed early the inode
+		 * may still be on the orphan list. In that case, we
+		 * need try remove the inode from the linked list in
+		 * memory.
+		 */
+		if (inode->i_nlink)
+			ext4_orphan_del(NULL, inode);
+	}
+	return ret;
+}
+
+/*
+ * The inode may have been placed onto the orphan list or has had
+ * blocks allocated beyond EOF as a result of an extension. We need to
+ * ensure that any necessary cleanup routines are performed if the
+ * error path has been taken for a write.
+ */
+static int ext4_handle_failed_inode_extension(struct inode *inode, loff_t size)
+{
+	int ret = 0;
+	handle_t *handle;
+
+	if (size > i_size_read(inode))
+		ext4_truncate_failed_write(inode);
+
+	if (!list_empty(&EXT4_I(inode)->i_orphan)) {
+		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+		if (IS_ERR(handle)) {
+			if (inode->i_nlink)
+				ext4_orphan_del(NULL, inode);
+			return PTR_ERR(handle);
+		}
+		if (inode->i_nlink)
+			ext4_orphan_del(handle, inode);
+		ext4_journal_stop(handle);
+	}
+	return ret;
+}
+
 #ifdef CONFIG_FS_DAX
 static ssize_t
 ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
-	struct inode *inode = file_inode(iocb->ki_filp);
 	ssize_t ret;
+	int error = 0;
+	loff_t offset;
+	struct inode *inode = file_inode(iocb->ki_filp);
 
 	if (!inode_trylock(inode)) {
 		if (iocb->ki_flags & IOCB_NOWAIT)
@@ -255,7 +335,18 @@  ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (ret)
 		goto out;
 
+	offset = iocb->ki_pos;
 	ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
+	if (ret > 0 && iocb->ki_pos > i_size_read(inode))
+		error = ext4_handle_inode_extension(inode, offset, ret,
+						    iov_iter_count(from));
+
+	if (ret < 0)
+		error = ext4_handle_failed_inode_extension(inode,
+							   iocb->ki_pos);
+
+	if (error)
+		ret = error;
 out:
 	inode_unlock(inode);
 	if (ret > 0)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 420fe3deed39..761ce6286b05 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3601,53 +3601,7 @@  static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
 			  ssize_t written, unsigned flags, struct iomap *iomap)
 {
-	int ret = 0;
-	handle_t *handle;
-	int blkbits = inode->i_blkbits;
-	bool truncate = false;
-
-	if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
-		return 0;
-
-	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		goto orphan_del;
-	}
-	if (ext4_update_inode_size(inode, offset + written))
-		ext4_mark_inode_dirty(handle, inode);
-	/*
-	 * We may need to truncate allocated but not written blocks beyond EOF.
-	 */
-	if (iomap->offset + iomap->length > 
-	    ALIGN(inode->i_size, 1 << blkbits)) {
-		ext4_lblk_t written_blk, end_blk;
-
-		written_blk = (offset + written) >> blkbits;
-		end_blk = (offset + length) >> blkbits;
-		if (written_blk < end_blk && ext4_can_truncate(inode))
-			truncate = true;
-	}
-	/*
-	 * Remove inode from orphan list if we were extending a inode and
-	 * everything went fine.
-	 */
-	if (!truncate && inode->i_nlink &&
-	    !list_empty(&EXT4_I(inode)->i_orphan))
-		ext4_orphan_del(handle, inode);
-	ext4_journal_stop(handle);
-	if (truncate) {
-		ext4_truncate_failed_write(inode);
-orphan_del:
-		/*
-		 * If truncate failed early the inode might still be on the
-		 * orphan list; we need to make sure the inode is removed from
-		 * the orphan list in that case.
-		 */
-		if (inode->i_nlink)
-			ext4_orphan_del(NULL, inode);
-	}
-	return ret;
+	return 0;
 }
 
 const struct iomap_ops ext4_iomap_ops = {