diff mbox series

[v3,2/6] ext4: move inode extension/truncate code out from ext4_iomap_end()

Message ID 784214745d589dd2bdcde2d69a69e837e6980592.1568282664.git.mbobrowski@mbobrowski.org
State Superseded
Headers show
Series ext4: port direct IO to iomap infrastructure | expand

Commit Message

Matthew Bobrowski Sept. 12, 2019, 11:04 a.m. UTC
In preparation for implementing the iomap direct IO write path
modifications, the inode extension/truncate code needs to be moved out
from ext4_iomap_end(). For direct IO, if the current code remained
within ext4_iomap_end() it would behave incorrectly. If we update the
inode size prior to converting unwritten extents we run the risk of
allowing a racing direct IO read operation to find unwritten extents
before they are converted.

The inode extension/truncate code has been moved out into a new helper
ext4_handle_inode_extension(). This helper has been designed so that
it can be used by both DAX and direct IO paths in the instance that
the result of the write is extending the inode.

Signed-off-by: Matthew Bobrowski <mbobrowski@mbobrowski.org>
Reviewed-by: Ritesh Harjani <riteshh@linux.ibm.com>
---
 fs/ext4/file.c  | 92 ++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/ext4/inode.c | 48 +-------------------------
 2 files changed, 92 insertions(+), 48 deletions(-)

Comments

Jan Kara Sept. 23, 2019, 4:21 p.m. UTC | #1
On Thu 12-09-19 21:04:00, Matthew Bobrowski wrote:
> @@ -233,12 +234,90 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
>  	return iov_iter_count(from);
>  }
>  
> +static int ext4_handle_inode_extension(struct inode *inode, loff_t offset,
> +				       ssize_t len, size_t count)

Traditionally, we call one of the length arguments 'copied' or 'written' to
denote actual amount of data processed and the original length is called
'len' or 'length' in iomap code. Can you please rename the arguments to
follow this convention?

> +{
> +	handle_t *handle;
> +	bool truncate = false;
> +	ext4_lblk_t written_blk, end_blk;
> +	int ret = 0, blkbits = inode->i_blkbits;
> +
> +	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
> +	if (IS_ERR(handle)) {
> +		ret = PTR_ERR(handle);
> +		goto orphan_del;
> +	}
> +
> +	if (ext4_update_inode_size(inode, offset + len))
> +		ext4_mark_inode_dirty(handle, inode);
> +
> +	/*
> +	 * We may need truncate allocated but not written blocks
> +	 * beyond EOF.
> +	 */
> +	written_blk = ALIGN(offset + len, 1 << blkbits);
> +	end_blk = ALIGN(offset + count, 1 << blkbits);
> +	if (written_blk < end_blk && ext4_can_truncate(inode))
> +		truncate = true;
> +
> +	/*
> +	 * Remove the inode from the orphan list if it has been
> +	 * extended and everything went OK.
> +	 */
> +	if (!truncate && inode->i_nlink)
> +		ext4_orphan_del(handle, inode);
> +	ext4_journal_stop(handle);
> +
> +	if (truncate) {
> +		ext4_truncate_failed_write(inode);
> +orphan_del:
> +		/*
> +		 * If the truncate operation failed early the inode
> +		 * may still be on the orphan list. In that case, we
> +		 * need try remove the inode from the linked list in
> +		 * memory.
> +		 */
> +		if (inode->i_nlink)
> +			ext4_orphan_del(NULL, inode);
> +	}
> +	return ret;
> +}
> +
> +/*
> + * The inode may have been placed onto the orphan list or has had
> + * blocks allocated beyond EOF as a result of an extension. We need to
> + * ensure that any necessary cleanup routines are performed if the
> + * error path has been taken for a write.
> + */
> +static int ext4_handle_failed_inode_extension(struct inode *inode, loff_t size)
> +{
> +	handle_t *handle;
> +
> +	if (size > i_size_read(inode))
> +		ext4_truncate_failed_write(inode);
> +
> +	if (!list_empty(&EXT4_I(inode)->i_orphan)) {
> +		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
> +		if (IS_ERR(handle)) {
> +			if (inode->i_nlink)
> +				ext4_orphan_del(NULL, inode);
> +			return PTR_ERR(handle);
> +		}
> +		if (inode->i_nlink)
> +			ext4_orphan_del(handle, inode);
> +		ext4_journal_stop(handle);
> +	}
> +	return 0;
> +}
> +

After some thought, I'd just drop this function and fold the functionality
into ext4_handle_inode_extension() by making it accept negative 'len'
argument indicating error. The code just happens to be the simplest in that
case (see below).

> @@ -255,7 +334,18 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  	if (ret)
>  		goto out;
>  
> +	offset = iocb->ki_pos;
>  	ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
> +	if (ret > 0 && iocb->ki_pos > i_size_read(inode))
> +		error = ext4_handle_inode_extension(inode, offset, ret,
> +						    iov_iter_count(from));

You need to sample iov_iter_count(from) before calling dax_iomap_rw(). At
this point iov_iter_count(from) is just what's left in the iter after
writing what we could.

Also I don't think the condition iocb->ki_pos > i_size_read(inode) is
correct here. Because it may happen that offset + count > i_size so we
allocate some blocks beyond i_size but then we manage to copy only less so
offset + ret == iocb->ki_pos <= i_size and you will not call
ext4_handle_inode_extension() to truncate allocated blocks beyond i_size.

So I'd just call ext4_handle_inode_extension() unconditionally like:

	error = ext4_handle_inode_extension(inode, offset, ret, len);

and have a quick check at the beginning of that function to avoid starting
transaction when there isn't anything to do. Something like:

	/*
	 * DIO and DAX writes get exclusion from truncate (i_rwsem) and
	 * page writeback (i_rwsem and flushing all dirty pages).
	 */
	WARN_ON_ONCE(i_size_read(inode) != EXT4_I(inode)->i_disksize);
	if (offset + count <= i_size_read(inode))
		return 0;
	if (len < 0)
		goto truncate;

	... do the heavylifting with transaction start, inode size update,
	and orphan handling...

	if (truncate) {
truncate:
		ext4_truncate_failed_write(inode);
orphan_del:
		/*
		 * If the truncate operation failed early the inode
		 * may still be on the orphan list. In that case, we
		 * need try remove the inode from the linked list in
		 * memory.
		 */
		if (inode->i_nlink)
			ext4_orphan_del(NULL, inode);
	}

> +
> +	if (ret < 0)
> +		error = ext4_handle_failed_inode_extension(inode,
> +							   iocb->ki_pos);
> +
> +	if (error)
> +		ret = error;
>  out:
>  	inode_unlock(inode);
>  	if (ret > 0)

								Honza
Matthew Bobrowski Sept. 24, 2019, 9:50 a.m. UTC | #2
On Mon, Sep 23, 2019 at 06:21:15PM +0200, Jan Kara wrote:
> On Thu 12-09-19 21:04:00, Matthew Bobrowski wrote:
> > @@ -233,12 +234,90 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
> >  	return iov_iter_count(from);
> >  }
> >  
> > +static int ext4_handle_inode_extension(struct inode *inode, loff_t offset,
> > +				       ssize_t len, size_t count)
> 
> Traditionally, we call one of the length arguments 'copied' or 'written' to
> denote actual amount of data processed and the original length is called
> 'len' or 'length' in iomap code. Can you please rename the arguments to
> follow this convention?

Sure, I will go with 'written'.

> > +/*
> > + * The inode may have been placed onto the orphan list or has had
> > + * blocks allocated beyond EOF as a result of an extension. We need to
> > + * ensure that any necessary cleanup routines are performed if the
> > + * error path has been taken for a write.
> > + */
> > +static int ext4_handle_failed_inode_extension(struct inode *inode, loff_t size)
> > +{
> > +	handle_t *handle;
> > +
> > +	if (size > i_size_read(inode))
> > +		ext4_truncate_failed_write(inode);
> > +
> > +	if (!list_empty(&EXT4_I(inode)->i_orphan)) {
> > +		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
> > +		if (IS_ERR(handle)) {
> > +			if (inode->i_nlink)
> > +				ext4_orphan_del(NULL, inode);
> > +			return PTR_ERR(handle);
> > +		}
> > +		if (inode->i_nlink)
> > +			ext4_orphan_del(handle, inode);
> > +		ext4_journal_stop(handle);
> > +	}
> > +	return 0;
> > +}
> > +
> 
> After some thought, I'd just drop this function and fold the functionality
> into ext4_handle_inode_extension() by making it accept negative 'len'
> argument indicating error. The code just happens to be the simplest in that
> case (see below).
> 
> > @@ -255,7 +334,18 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
> >  	if (ret)
> >  		goto out;
> >  
> > +	offset = iocb->ki_pos;
> >  	ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
> > +	if (ret > 0 && iocb->ki_pos > i_size_read(inode))
> > +		error = ext4_handle_inode_extension(inode, offset, ret,
> > +						    iov_iter_count(from));
> 
> You need to sample iov_iter_count(from) before calling dax_iomap_rw(). At
> this point iov_iter_count(from) is just what's left in the iter after
> writing what we could.
> 
> Also I don't think the condition iocb->ki_pos > i_size_read(inode) is
> correct here. Because it may happen that offset + count > i_size so we
> allocate some blocks beyond i_size but then we manage to copy only less so
> offset + ret == iocb->ki_pos <= i_size and you will not call
> ext4_handle_inode_extension() to truncate allocated blocks beyond i_size.

Yeah, this makes sense. I will implement it this way. Once again, appreciate
your suggestions.
 
> So I'd just call ext4_handle_inode_extension() unconditionally like:
> 
> 	error = ext4_handle_inode_extension(inode, offset, ret, len);
> 
> and have a quick check at the beginning of that function to avoid starting
> transaction when there isn't anything to do. Something like:
> 
> 	/*
> 	 * DIO and DAX writes get exclusion from truncate (i_rwsem) and
> 	 * page writeback (i_rwsem and flushing all dirty pages).
> 	 */
> 	WARN_ON_ONCE(i_size_read(inode) != EXT4_I(inode)->i_disksize);
> 	if (offset + count <= i_size_read(inode))
> 		return 0;
> 	if (len < 0)
> 		goto truncate;
> 
> 	... do the heavylifting with transaction start, inode size update,
> 	and orphan handling...
> 
> 	if (truncate) {
> truncate:
> 		ext4_truncate_failed_write(inode);
> orphan_del:
> 		/*
> 		 * If the truncate operation failed early the inode
> 		 * may still be on the orphan list. In that case, we
> 		 * need try remove the inode from the linked list in
> 		 * memory.
> 		 */
> 		if (inode->i_nlink)
> 			ext4_orphan_del(NULL, inode);
> 	}

--<M>--
Jan Kara Sept. 24, 2019, 1:13 p.m. UTC | #3
On Mon 23-09-19 18:21:15, Jan Kara wrote:
> On Thu 12-09-19 21:04:00, Matthew Bobrowski wrote:
> So I'd just call ext4_handle_inode_extension() unconditionally like:
> 
> 	error = ext4_handle_inode_extension(inode, offset, ret, len);
> 
> and have a quick check at the beginning of that function to avoid starting
> transaction when there isn't anything to do. Something like:
> 
> 	/*
> 	 * DIO and DAX writes get exclusion from truncate (i_rwsem) and
> 	 * page writeback (i_rwsem and flushing all dirty pages).
> 	 */
> 	WARN_ON_ONCE(i_size_read(inode) != EXT4_I(inode)->i_disksize);
> 	if (offset + count <= i_size_read(inode))
> 		return 0;

One correction here. With commit 45d8ec4d9fd54 in mind this should be:

 	/*
 	 * DIO and DAX writes get exclusion from truncate (i_rwsem). There
 	 * can be pending delalloc writeback beyond the range written by
 	 * DIO though.
 	 */
 	WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize);
 	if (offset + count <= EXT4_I(inode)->i_disksize)
 		return 0;

								Honza

> 	if (len < 0)
> 		goto truncate;
> 
> 	... do the heavylifting with transaction start, inode size update,
> 	and orphan handling...
> 
> 	if (truncate) {
> truncate:
> 		ext4_truncate_failed_write(inode);
> orphan_del:
> 		/*
> 		 * If the truncate operation failed early the inode
> 		 * may still be on the orphan list. In that case, we
> 		 * need try remove the inode from the linked list in
> 		 * memory.
> 		 */
> 		if (inode->i_nlink)
> 			ext4_orphan_del(NULL, inode);
> 	}
diff mbox series

Patch

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index e52e3928dc25..6457c629b8cf 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -33,6 +33,7 @@ 
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
+#include "truncate.h"
 
 static bool ext4_dio_checks(struct inode *inode)
 {
@@ -233,12 +234,90 @@  static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
 	return iov_iter_count(from);
 }
 
+static int ext4_handle_inode_extension(struct inode *inode, loff_t offset,
+				       ssize_t len, size_t count)
+{
+	handle_t *handle;
+	bool truncate = false;
+	ext4_lblk_t written_blk, end_blk;
+	int ret = 0, blkbits = inode->i_blkbits;
+
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto orphan_del;
+	}
+
+	if (ext4_update_inode_size(inode, offset + len))
+		ext4_mark_inode_dirty(handle, inode);
+
+	/*
+	 * We may need truncate allocated but not written blocks
+	 * beyond EOF.
+	 */
+	written_blk = ALIGN(offset + len, 1 << blkbits);
+	end_blk = ALIGN(offset + count, 1 << blkbits);
+	if (written_blk < end_blk && ext4_can_truncate(inode))
+		truncate = true;
+
+	/*
+	 * Remove the inode from the orphan list if it has been
+	 * extended and everything went OK.
+	 */
+	if (!truncate && inode->i_nlink)
+		ext4_orphan_del(handle, inode);
+	ext4_journal_stop(handle);
+
+	if (truncate) {
+		ext4_truncate_failed_write(inode);
+orphan_del:
+		/*
+		 * If the truncate operation failed early the inode
+		 * may still be on the orphan list. In that case, we
+		 * need try remove the inode from the linked list in
+		 * memory.
+		 */
+		if (inode->i_nlink)
+			ext4_orphan_del(NULL, inode);
+	}
+	return ret;
+}
+
+/*
+ * The inode may have been placed onto the orphan list or has had
+ * blocks allocated beyond EOF as a result of an extension. We need to
+ * ensure that any necessary cleanup routines are performed if the
+ * error path has been taken for a write.
+ */
+static int ext4_handle_failed_inode_extension(struct inode *inode, loff_t size)
+{
+	handle_t *handle;
+
+	if (size > i_size_read(inode))
+		ext4_truncate_failed_write(inode);
+
+	if (!list_empty(&EXT4_I(inode)->i_orphan)) {
+		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+		if (IS_ERR(handle)) {
+			if (inode->i_nlink)
+				ext4_orphan_del(NULL, inode);
+			return PTR_ERR(handle);
+		}
+		if (inode->i_nlink)
+			ext4_orphan_del(handle, inode);
+		ext4_journal_stop(handle);
+	}
+	return 0;
+}
+
 #ifdef CONFIG_FS_DAX
 static ssize_t
 ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
-	struct inode *inode = file_inode(iocb->ki_filp);
 	ssize_t ret;
+	int error = 0;
+	loff_t offset;
+	struct inode *inode = file_inode(iocb->ki_filp);
 
 	if (!inode_trylock(inode)) {
 		if (iocb->ki_flags & IOCB_NOWAIT)
@@ -255,7 +334,18 @@  ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (ret)
 		goto out;
 
+	offset = iocb->ki_pos;
 	ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
+	if (ret > 0 && iocb->ki_pos > i_size_read(inode))
+		error = ext4_handle_inode_extension(inode, offset, ret,
+						    iov_iter_count(from));
+
+	if (ret < 0)
+		error = ext4_handle_failed_inode_extension(inode,
+							   iocb->ki_pos);
+
+	if (error)
+		ret = error;
 out:
 	inode_unlock(inode);
 	if (ret > 0)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 420fe3deed39..761ce6286b05 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3601,53 +3601,7 @@  static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
 			  ssize_t written, unsigned flags, struct iomap *iomap)
 {
-	int ret = 0;
-	handle_t *handle;
-	int blkbits = inode->i_blkbits;
-	bool truncate = false;
-
-	if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
-		return 0;
-
-	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		goto orphan_del;
-	}
-	if (ext4_update_inode_size(inode, offset + written))
-		ext4_mark_inode_dirty(handle, inode);
-	/*
-	 * We may need to truncate allocated but not written blocks beyond EOF.
-	 */
-	if (iomap->offset + iomap->length > 
-	    ALIGN(inode->i_size, 1 << blkbits)) {
-		ext4_lblk_t written_blk, end_blk;
-
-		written_blk = (offset + written) >> blkbits;
-		end_blk = (offset + length) >> blkbits;
-		if (written_blk < end_blk && ext4_can_truncate(inode))
-			truncate = true;
-	}
-	/*
-	 * Remove inode from orphan list if we were extending a inode and
-	 * everything went fine.
-	 */
-	if (!truncate && inode->i_nlink &&
-	    !list_empty(&EXT4_I(inode)->i_orphan))
-		ext4_orphan_del(handle, inode);
-	ext4_journal_stop(handle);
-	if (truncate) {
-		ext4_truncate_failed_write(inode);
-orphan_del:
-		/*
-		 * If truncate failed early the inode might still be on the
-		 * orphan list; we need to make sure the inode is removed from
-		 * the orphan list in that case.
-		 */
-		if (inode->i_nlink)
-			ext4_orphan_del(NULL, inode);
-	}
-	return ret;
+	return 0;
 }
 
 const struct iomap_ops ext4_iomap_ops = {