diff mbox series

[05/27] ext4: refactor ext4_zero_range()

Message ID 20241022111059.2566137-6-yi.zhang@huaweicloud.com
State New
Headers show
Series ext4: use iomap for regular file's buffered I/O path and enable large folio | expand

Commit Message

Zhang Yi Oct. 22, 2024, 11:10 a.m. UTC
From: Zhang Yi <yi.zhang@huawei.com>

The current implementation of ext4_zero_range() contains complex
position calculations and stale error tags. To improve the code's
clarity and maintainability, it is essential to clean up the code and
improve its readability, this can be achieved by: a) simplifying and
renaming variables, making the style the same as ext4_punch_hole(); b)
eliminating unnecessary position calculations, writing back all data in
data=journal mode, and drop page cache from the original offset to the
end, rather than using aligned blocks; c) renaming the stale out_mutex
tags.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
 fs/ext4/extents.c | 161 +++++++++++++++++++---------------------------
 1 file changed, 65 insertions(+), 96 deletions(-)

Comments

Jan Kara Dec. 4, 2024, 11:52 a.m. UTC | #1
On Tue 22-10-24 19:10:36, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
> 
> The current implementation of ext4_zero_range() contains complex
> position calculations and stale error tags. To improve the code's
> clarity and maintainability, it is essential to clean up the code and
> improve its readability, this can be achieved by: a) simplifying and
> renaming variables, making the style the same as ext4_punch_hole(); b)
> eliminating unnecessary position calculations, writing back all data in
> data=journal mode, and drop page cache from the original offset to the
> end, rather than using aligned blocks; c) renaming the stale out_mutex
> tags.
> 
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>

...

> -		goto out_mutex;
> -
> -	/* Preallocate the range including the unaligned edges */
> -	if (partial_begin || partial_end) {
> -		ret = ext4_alloc_file_blocks(file,
> -				round_down(offset, 1 << blkbits) >> blkbits,
> -				(round_up((offset + len), 1 << blkbits) -
> -				 round_down(offset, 1 << blkbits)) >> blkbits,
> -				new_size, flags);
> -		if (ret)
> -			goto out_mutex;
> -
> -	}

So I think we should keep this first ext4_alloc_file_blocks() call before
we truncate the page cache. Otherwise if ext4_alloc_file_blocks() fails due
to ENOSPC, we have already lost the dirty data originally in the zeroed
range. All the other failure modes are kind of catastrophic anyway, so they
are fine after dropping the page cache. But this is can be quite common and
should be handled more gracefully.

								Honza

> -
> -	/* Zero range excluding the unaligned edges */
> -	if (max_blocks > 0) {
> -		flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
> -			  EXT4_EX_NOCACHE);
> +		goto out;
>  
> -		/*
> -		 * Prevent page faults from reinstantiating pages we have
> -		 * released from page cache.
> -		 */
> -		filemap_invalidate_lock(mapping);
> +	/*
> +	 * Prevent page faults from reinstantiating pages we have released
> +	 * from page cache.
> +	 */
> +	filemap_invalidate_lock(mapping);
>  
> -		ret = ext4_break_layouts(inode);
> -		if (ret) {
> -			filemap_invalidate_unlock(mapping);
> -			goto out_mutex;
> -		}
> +	ret = ext4_break_layouts(inode);
> +	if (ret)
> +		goto out_invalidate_lock;
>  
> +	/*
> +	 * For journalled data we need to write (and checkpoint) pages before
> +	 * discarding page cache to avoid inconsitent data on disk in case of
> +	 * crash before zeroing trans is committed.
> +	 */
> +	if (ext4_should_journal_data(inode)) {
> +		ret = filemap_write_and_wait_range(mapping, offset, end - 1);
> +	} else {
>  		ret = ext4_update_disksize_before_punch(inode, offset, len);
> -		if (ret) {
> -			filemap_invalidate_unlock(mapping);
> -			goto out_mutex;
> -		}
> +		ext4_truncate_folios_range(inode, offset, end);
> +	}
> +	if (ret)
> +		goto out_invalidate_lock;
>  
> -		/*
> -		 * For journalled data we need to write (and checkpoint) pages
> -		 * before discarding page cache to avoid inconsitent data on
> -		 * disk in case of crash before zeroing trans is committed.
> -		 */
> -		if (ext4_should_journal_data(inode)) {
> -			ret = filemap_write_and_wait_range(mapping, start,
> -							   end - 1);
> -			if (ret) {
> -				filemap_invalidate_unlock(mapping);
> -				goto out_mutex;
> -			}
> -		}
> +	/* Now release the pages and zero block aligned part of pages */
> +	truncate_pagecache_range(inode, offset, end - 1);
>  
> -		/* Now release the pages and zero block aligned part of pages */
> -		ext4_truncate_folios_range(inode, start, end);
> -		truncate_pagecache_range(inode, start, end - 1);
> +	flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
> +	/* Preallocate the range including the unaligned edges */
> +	if (offset & (blocksize - 1) || end & (blocksize - 1)) {
> +		ext4_lblk_t alloc_lblk = offset >> blkbits;
> +		ext4_lblk_t len_lblk = EXT4_MAX_BLOCKS(len, offset, blkbits);
>  
> -		ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
> -					     flags);
> -		filemap_invalidate_unlock(mapping);
> +		ret = ext4_alloc_file_blocks(file, alloc_lblk, len_lblk,
> +					     new_size, flags);
>  		if (ret)
> -			goto out_mutex;
> +			goto out_invalidate_lock;
>  	}
> -	if (!partial_begin && !partial_end)
> -		goto out_mutex;
> +
> +	/* Zero range excluding the unaligned edges */
> +	start_lblk = round_up(offset, blocksize) >> blkbits;
> +	end_lblk = end >> blkbits;
> +	if (end_lblk > start_lblk) {
> +		ext4_lblk_t zero_blks = end_lblk - start_lblk;
> +
> +		flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE);
> +		ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks,
> +					     new_size, flags);
> +		if (ret)
> +			goto out_invalidate_lock;
> +	}
> +	/* Finish zeroing out if it doesn't contain partial block */
> +	if (!(offset & (blocksize - 1)) && !(end & (blocksize - 1)))
> +		goto out_invalidate_lock;
>  
>  	/*
>  	 * In worst case we have to writeout two nonadjacent unwritten
> @@ -4700,25 +4665,29 @@ static long ext4_zero_range(struct file *file, loff_t offset,
>  	if (IS_ERR(handle)) {
>  		ret = PTR_ERR(handle);
>  		ext4_std_error(inode->i_sb, ret);
> -		goto out_mutex;
> +		goto out_invalidate_lock;
>  	}
>  
> +	/* Zero out partial block at the edges of the range */
> +	ret = ext4_zero_partial_blocks(handle, inode, offset, len);
> +	if (ret)
> +		goto out_handle;
> +
>  	if (new_size)
>  		ext4_update_inode_size(inode, new_size);
>  	ret = ext4_mark_inode_dirty(handle, inode);
>  	if (unlikely(ret))
>  		goto out_handle;
> -	/* Zero out partial block at the edges of the range */
> -	ret = ext4_zero_partial_blocks(handle, inode, offset, len);
> -	if (ret >= 0)
> -		ext4_update_inode_fsync_trans(handle, inode, 1);
>  
> +	ext4_update_inode_fsync_trans(handle, inode, 1);
>  	if (file->f_flags & O_SYNC)
>  		ext4_handle_sync(handle);
>  
>  out_handle:
>  	ext4_journal_stop(handle);
> -out_mutex:
> +out_invalidate_lock:
> +	filemap_invalidate_unlock(mapping);
> +out:
>  	inode_unlock(inode);
>  	return ret;
>  }
> -- 
> 2.46.1
>
Zhang Yi Dec. 6, 2024, 8:09 a.m. UTC | #2
On 2024/12/4 19:52, Jan Kara wrote:
> On Tue 22-10-24 19:10:36, Zhang Yi wrote:
>> From: Zhang Yi <yi.zhang@huawei.com>
>>
>> The current implementation of ext4_zero_range() contains complex
>> position calculations and stale error tags. To improve the code's
>> clarity and maintainability, it is essential to clean up the code and
>> improve its readability, this can be achieved by: a) simplifying and
>> renaming variables, making the style the same as ext4_punch_hole(); b)
>> eliminating unnecessary position calculations, writing back all data in
>> data=journal mode, and drop page cache from the original offset to the
>> end, rather than using aligned blocks; c) renaming the stale out_mutex
>> tags.
>>
>> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> 
> ...
> 
>> -		goto out_mutex;
>> -
>> -	/* Preallocate the range including the unaligned edges */
>> -	if (partial_begin || partial_end) {
>> -		ret = ext4_alloc_file_blocks(file,
>> -				round_down(offset, 1 << blkbits) >> blkbits,
>> -				(round_up((offset + len), 1 << blkbits) -
>> -				 round_down(offset, 1 << blkbits)) >> blkbits,
>> -				new_size, flags);
>> -		if (ret)
>> -			goto out_mutex;
>> -
>> -	}
> 
> So I think we should keep this first ext4_alloc_file_blocks() call before
> we truncate the page cache. Otherwise if ext4_alloc_file_blocks() fails due
> to ENOSPC, we have already lost the dirty data originally in the zeroed
> range. All the other failure modes are kind of catastrophic anyway, so they
> are fine after dropping the page cache. But this is can be quite common and
> should be handled more gracefully.
> 

Ha, right, I missed this error case, I will revise it.

Thanks,
Yi.
diff mbox series

Patch

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index aa07b5ddaff8..f843342e5164 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4565,40 +4565,15 @@  static long ext4_zero_range(struct file *file, loff_t offset,
 	struct inode *inode = file_inode(file);
 	struct address_space *mapping = file->f_mapping;
 	handle_t *handle = NULL;
-	unsigned int max_blocks;
 	loff_t new_size = 0;
-	int ret = 0;
-	int flags;
-	int credits;
-	int partial_begin, partial_end;
-	loff_t start, end;
-	ext4_lblk_t lblk;
+	loff_t end = offset + len;
+	ext4_lblk_t start_lblk, end_lblk;
+	unsigned int blocksize = i_blocksize(inode);
 	unsigned int blkbits = inode->i_blkbits;
+	int ret, flags, credits;
 
 	trace_ext4_zero_range(inode, offset, len, mode);
 
-	/*
-	 * Round up offset. This is not fallocate, we need to zero out
-	 * blocks, so convert interior block aligned part of the range to
-	 * unwritten and possibly manually zero out unaligned parts of the
-	 * range. Here, start and partial_begin are inclusive, end and
-	 * partial_end are exclusive.
-	 */
-	start = round_up(offset, 1 << blkbits);
-	end = round_down((offset + len), 1 << blkbits);
-
-	if (start < offset || end > offset + len)
-		return -EINVAL;
-	partial_begin = offset & ((1 << blkbits) - 1);
-	partial_end = (offset + len) & ((1 << blkbits) - 1);
-
-	lblk = start >> blkbits;
-	max_blocks = (end >> blkbits);
-	if (max_blocks < lblk)
-		max_blocks = 0;
-	else
-		max_blocks -= lblk;
-
 	inode_lock(inode);
 
 	/*
@@ -4606,88 +4581,78 @@  static long ext4_zero_range(struct file *file, loff_t offset,
 	 */
 	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
 		ret = -EOPNOTSUPP;
-		goto out_mutex;
+		goto out;
 	}
 
 	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
-	    (offset + len > inode->i_size ||
-	     offset + len > EXT4_I(inode)->i_disksize)) {
-		new_size = offset + len;
+	    (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) {
+		new_size = end;
 		ret = inode_newsize_ok(inode, new_size);
 		if (ret)
-			goto out_mutex;
+			goto out;
 	}
 
-	flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
-
 	/* Wait all existing dio workers, newcomers will block on i_rwsem */
 	inode_dio_wait(inode);
 
 	ret = file_modified(file);
 	if (ret)
-		goto out_mutex;
-
-	/* Preallocate the range including the unaligned edges */
-	if (partial_begin || partial_end) {
-		ret = ext4_alloc_file_blocks(file,
-				round_down(offset, 1 << blkbits) >> blkbits,
-				(round_up((offset + len), 1 << blkbits) -
-				 round_down(offset, 1 << blkbits)) >> blkbits,
-				new_size, flags);
-		if (ret)
-			goto out_mutex;
-
-	}
-
-	/* Zero range excluding the unaligned edges */
-	if (max_blocks > 0) {
-		flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
-			  EXT4_EX_NOCACHE);
+		goto out;
 
-		/*
-		 * Prevent page faults from reinstantiating pages we have
-		 * released from page cache.
-		 */
-		filemap_invalidate_lock(mapping);
+	/*
+	 * Prevent page faults from reinstantiating pages we have released
+	 * from page cache.
+	 */
+	filemap_invalidate_lock(mapping);
 
-		ret = ext4_break_layouts(inode);
-		if (ret) {
-			filemap_invalidate_unlock(mapping);
-			goto out_mutex;
-		}
+	ret = ext4_break_layouts(inode);
+	if (ret)
+		goto out_invalidate_lock;
 
+	/*
+	 * For journalled data we need to write (and checkpoint) pages before
+	 * discarding page cache to avoid inconsitent data on disk in case of
+	 * crash before zeroing trans is committed.
+	 */
+	if (ext4_should_journal_data(inode)) {
+		ret = filemap_write_and_wait_range(mapping, offset, end - 1);
+	} else {
 		ret = ext4_update_disksize_before_punch(inode, offset, len);
-		if (ret) {
-			filemap_invalidate_unlock(mapping);
-			goto out_mutex;
-		}
+		ext4_truncate_folios_range(inode, offset, end);
+	}
+	if (ret)
+		goto out_invalidate_lock;
 
-		/*
-		 * For journalled data we need to write (and checkpoint) pages
-		 * before discarding page cache to avoid inconsitent data on
-		 * disk in case of crash before zeroing trans is committed.
-		 */
-		if (ext4_should_journal_data(inode)) {
-			ret = filemap_write_and_wait_range(mapping, start,
-							   end - 1);
-			if (ret) {
-				filemap_invalidate_unlock(mapping);
-				goto out_mutex;
-			}
-		}
+	/* Now release the pages and zero block aligned part of pages */
+	truncate_pagecache_range(inode, offset, end - 1);
 
-		/* Now release the pages and zero block aligned part of pages */
-		ext4_truncate_folios_range(inode, start, end);
-		truncate_pagecache_range(inode, start, end - 1);
+	flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
+	/* Preallocate the range including the unaligned edges */
+	if (offset & (blocksize - 1) || end & (blocksize - 1)) {
+		ext4_lblk_t alloc_lblk = offset >> blkbits;
+		ext4_lblk_t len_lblk = EXT4_MAX_BLOCKS(len, offset, blkbits);
 
-		ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
-					     flags);
-		filemap_invalidate_unlock(mapping);
+		ret = ext4_alloc_file_blocks(file, alloc_lblk, len_lblk,
+					     new_size, flags);
 		if (ret)
-			goto out_mutex;
+			goto out_invalidate_lock;
 	}
-	if (!partial_begin && !partial_end)
-		goto out_mutex;
+
+	/* Zero range excluding the unaligned edges */
+	start_lblk = round_up(offset, blocksize) >> blkbits;
+	end_lblk = end >> blkbits;
+	if (end_lblk > start_lblk) {
+		ext4_lblk_t zero_blks = end_lblk - start_lblk;
+
+		flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE);
+		ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks,
+					     new_size, flags);
+		if (ret)
+			goto out_invalidate_lock;
+	}
+	/* Finish zeroing out if it doesn't contain partial block */
+	if (!(offset & (blocksize - 1)) && !(end & (blocksize - 1)))
+		goto out_invalidate_lock;
 
 	/*
 	 * In worst case we have to writeout two nonadjacent unwritten
@@ -4700,25 +4665,29 @@  static long ext4_zero_range(struct file *file, loff_t offset,
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		ext4_std_error(inode->i_sb, ret);
-		goto out_mutex;
+		goto out_invalidate_lock;
 	}
 
+	/* Zero out partial block at the edges of the range */
+	ret = ext4_zero_partial_blocks(handle, inode, offset, len);
+	if (ret)
+		goto out_handle;
+
 	if (new_size)
 		ext4_update_inode_size(inode, new_size);
 	ret = ext4_mark_inode_dirty(handle, inode);
 	if (unlikely(ret))
 		goto out_handle;
-	/* Zero out partial block at the edges of the range */
-	ret = ext4_zero_partial_blocks(handle, inode, offset, len);
-	if (ret >= 0)
-		ext4_update_inode_fsync_trans(handle, inode, 1);
 
+	ext4_update_inode_fsync_trans(handle, inode, 1);
 	if (file->f_flags & O_SYNC)
 		ext4_handle_sync(handle);
 
 out_handle:
 	ext4_journal_stop(handle);
-out_mutex:
+out_invalidate_lock:
+	filemap_invalidate_unlock(mapping);
+out:
 	inode_unlock(inode);
 	return ret;
 }