diff mbox series

[3/4] ext4: avoid bb_free and bb_fragments inconsistency in mb_free_blocks()

Message ID 20231218141814.1477338-4-libaokun1@huawei.com
State Not Applicable
Headers show
Series ext4: fix divide error in mb_update_avg_fragment_size() | expand

Commit Message

Baokun Li Dec. 18, 2023, 2:18 p.m. UTC
After updating bb_free in mb_free_blocks, it is possible to return without
updating bb_fragments because the block being freed is found to have
already been freed, which leads to inconsistency between bb_free and
bb_fragments.

Since the group may be unlocked in ext4_grp_locked_error(), this can lead
to problems such as dividing by zero when calculating the average fragment
length. Therefore, to ensure consistency, move the update of bb_free to
after the block double-free check.

Fixes: eabe0444df90 ("ext4: speed-up releasing blocks on commit")
CC: stable@vger.kernel.org # 3.10
Signed-off-by: Baokun Li <libaokun1@huawei.com>
---
 fs/ext4/mballoc.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

Comments

Jan Kara Dec. 18, 2023, 3:14 p.m. UTC | #1
On Mon 18-12-23 22:18:13, Baokun Li wrote:
> After updating bb_free in mb_free_blocks, it is possible to return without
> updating bb_fragments because the block being freed is found to have
> already been freed, which leads to inconsistency between bb_free and
> bb_fragments.
> 
> Since the group may be unlocked in ext4_grp_locked_error(), this can lead
> to problems such as dividing by zero when calculating the average fragment
> length. Therefore, to ensure consistency, move the update of bb_free to
> after the block double-free check.
> 
> Fixes: eabe0444df90 ("ext4: speed-up releasing blocks on commit")
> CC: stable@vger.kernel.org # 3.10
> Signed-off-by: Baokun Li <libaokun1@huawei.com>

I agree there's no point in updating the allocation info if the bitmap is
corrupted. We will not try to allocate (or free) blocks in that group
anymore. I'm just a bit unsure about the EXT4_FC_REPLAY state where we
don't mark the bitmap as corrupted although some blocks were already marked
as freed. So in this case the free space statistics tracking will go
permanently wrong. I'm kind of wondering in which case does fast-commit
free already freed blocks. Ted, any idea?

								Honza


> ---
>  fs/ext4/mballoc.c | 13 ++++++-------
>  1 file changed, 6 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index a95fa6e2b0f9..2fbee0f0f5c3 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -1892,11 +1892,6 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
>  	mb_check_buddy(e4b);
>  	mb_free_blocks_double(inode, e4b, first, count);
>  
> -	this_cpu_inc(discard_pa_seq);
> -	e4b->bd_info->bb_free += count;
> -	if (first < e4b->bd_info->bb_first_free)
> -		e4b->bd_info->bb_first_free = first;
> -
>  	/* access memory sequentially: check left neighbour,
>  	 * clear range and then check right neighbour
>  	 */
> @@ -1922,9 +1917,14 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
>  				sb, e4b->bd_group,
>  				EXT4_GROUP_INFO_BBITMAP_CORRUPT);
>  		}
> -		goto done;
> +		return;
>  	}
>  
> +	this_cpu_inc(discard_pa_seq);
> +	e4b->bd_info->bb_free += count;
> +	if (first < e4b->bd_info->bb_first_free)
> +		e4b->bd_info->bb_first_free = first;
> +
>  	/* let's maintain fragments counter */
>  	if (left_is_free && right_is_free)
>  		e4b->bd_info->bb_fragments--;
> @@ -1949,7 +1949,6 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
>  	if (first <= last)
>  		mb_buddy_mark_free(e4b, first >> 1, last >> 1);
>  
> -done:
>  	mb_set_largest_free_order(sb, e4b->bd_info);
>  	mb_update_avg_fragment_size(sb, e4b->bd_info);
>  	mb_check_buddy(e4b);
> -- 
> 2.31.1
>
Baokun Li Dec. 19, 2023, 2:29 a.m. UTC | #2
On 2023/12/18 23:14, Jan Kara wrote:
> On Mon 18-12-23 22:18:13, Baokun Li wrote:
>> After updating bb_free in mb_free_blocks, it is possible to return without
>> updating bb_fragments because the block being freed is found to have
>> already been freed, which leads to inconsistency between bb_free and
>> bb_fragments.
>>
>> Since the group may be unlocked in ext4_grp_locked_error(), this can lead
>> to problems such as dividing by zero when calculating the average fragment
>> length. Therefore, to ensure consistency, move the update of bb_free to
>> after the block double-free check.
>>
>> Fixes: eabe0444df90 ("ext4: speed-up releasing blocks on commit")
>> CC: stable@vger.kernel.org # 3.10
>> Signed-off-by: Baokun Li <libaokun1@huawei.com>
> I agree there's no point in updating the allocation info if the bitmap is
> corrupted. We will not try to allocate (or free) blocks in that group
> anymore. I'm just a bit unsure about the EXT4_FC_REPLAY state where we
> don't mark the bitmap as corrupted although some blocks were already marked
> as freed. So in this case the free space statistics tracking will go
> permanently wrong. I'm kind of wondering in which case does fast-commit
> free already freed blocks. Ted, any idea?
>
> 								Honza
Some additional information, this judgment was introduced in
commit 8016e29f4362 ("ext4: fast commit recovery path") in v5.10-rc1,
at which point mb_regenerate_buddy() was called to regenerate the
buddy when it was found to be freeing a block that had already been
freed, so there was no problem. Until v5.11-rc1 commit 6bd97bf273bd
("ext4: remove redundant mb_regenerate_buddy()") removes the logic
to regenerate the buddy, it looks like the free space statistics will
remain wrong. If this normal scenario exists, perhaps buddy should
be regenerated here?

Thanks,
Baokun
>> ---
>>   fs/ext4/mballoc.c | 13 ++++++-------
>>   1 file changed, 6 insertions(+), 7 deletions(-)
>>
>> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
>> index a95fa6e2b0f9..2fbee0f0f5c3 100644
>> --- a/fs/ext4/mballoc.c
>> +++ b/fs/ext4/mballoc.c
>> @@ -1892,11 +1892,6 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
>>   	mb_check_buddy(e4b);
>>   	mb_free_blocks_double(inode, e4b, first, count);
>>   
>> -	this_cpu_inc(discard_pa_seq);
>> -	e4b->bd_info->bb_free += count;
>> -	if (first < e4b->bd_info->bb_first_free)
>> -		e4b->bd_info->bb_first_free = first;
>> -
>>   	/* access memory sequentially: check left neighbour,
>>   	 * clear range and then check right neighbour
>>   	 */
>> @@ -1922,9 +1917,14 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
>>   				sb, e4b->bd_group,
>>   				EXT4_GROUP_INFO_BBITMAP_CORRUPT);
>>   		}
>> -		goto done;
>> +		return;
>>   	}
>>   
>> +	this_cpu_inc(discard_pa_seq);
>> +	e4b->bd_info->bb_free += count;
>> +	if (first < e4b->bd_info->bb_first_free)
>> +		e4b->bd_info->bb_first_free = first;
>> +
>>   	/* let's maintain fragments counter */
>>   	if (left_is_free && right_is_free)
>>   		e4b->bd_info->bb_fragments--;
>> @@ -1949,7 +1949,6 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
>>   	if (first <= last)
>>   		mb_buddy_mark_free(e4b, first >> 1, last >> 1);
>>   
>> -done:
>>   	mb_set_largest_free_order(sb, e4b->bd_info);
>>   	mb_update_avg_fragment_size(sb, e4b->bd_info);
>>   	mb_check_buddy(e4b);
>> -- 
>> 2.31.1
>>
Baokun Li Dec. 20, 2023, 9 a.m. UTC | #3
On 2023/12/18 23:14, Jan Kara wrote:
> On Mon 18-12-23 22:18:13, Baokun Li wrote:
>> After updating bb_free in mb_free_blocks, it is possible to return without
>> updating bb_fragments because the block being freed is found to have
>> already been freed, which leads to inconsistency between bb_free and
>> bb_fragments.
>>
>> Since the group may be unlocked in ext4_grp_locked_error(), this can lead
>> to problems such as dividing by zero when calculating the average fragment
>> length. Therefore, to ensure consistency, move the update of bb_free to
>> after the block double-free check.
>>
>> Fixes: eabe0444df90 ("ext4: speed-up releasing blocks on commit")
>> CC: stable@vger.kernel.org # 3.10
>> Signed-off-by: Baokun Li <libaokun1@huawei.com>
> I agree there's no point in updating the allocation info if the bitmap is
> corrupted. We will not try to allocate (or free) blocks in that group
> anymore. I'm just a bit unsure about the EXT4_FC_REPLAY state where we
> don't mark the bitmap as corrupted although some blocks were already marked
> as freed. So in this case the free space statistics tracking will go
> permanently wrong. I'm kind of wondering in which case does fast-commit
> free already freed blocks. Ted, any idea?
>
> 								Honza
>
Hello Harshad!

Seeing that earlier you added EXT4_FC_REPLAY related judgment to
mb_free_blocks() in commit 8016e29f4362 ("ext4: fast commit recovery path"),
when there is EXT4_FC_REPLAY, even when freeing blocks that have already
been freed, the block bitmap is not marked as corrupted, is there a known
scenario here?

I would be grateful if you could shed some light on this!


Thanks,
Baokun
>> ---
>>   fs/ext4/mballoc.c | 13 ++++++-------
>>   1 file changed, 6 insertions(+), 7 deletions(-)
>>
>> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
>> index a95fa6e2b0f9..2fbee0f0f5c3 100644
>> --- a/fs/ext4/mballoc.c
>> +++ b/fs/ext4/mballoc.c
>> @@ -1892,11 +1892,6 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
>>   	mb_check_buddy(e4b);
>>   	mb_free_blocks_double(inode, e4b, first, count);
>>   
>> -	this_cpu_inc(discard_pa_seq);
>> -	e4b->bd_info->bb_free += count;
>> -	if (first < e4b->bd_info->bb_first_free)
>> -		e4b->bd_info->bb_first_free = first;
>> -
>>   	/* access memory sequentially: check left neighbour,
>>   	 * clear range and then check right neighbour
>>   	 */
>> @@ -1922,9 +1917,14 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
>>   				sb, e4b->bd_group,
>>   				EXT4_GROUP_INFO_BBITMAP_CORRUPT);
>>   		}
>> -		goto done;
>> +		return;
>>   	}
>>   
>> +	this_cpu_inc(discard_pa_seq);
>> +	e4b->bd_info->bb_free += count;
>> +	if (first < e4b->bd_info->bb_first_free)
>> +		e4b->bd_info->bb_first_free = first;
>> +
>>   	/* let's maintain fragments counter */
>>   	if (left_is_free && right_is_free)
>>   		e4b->bd_info->bb_fragments--;
>> @@ -1949,7 +1949,6 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
>>   	if (first <= last)
>>   		mb_buddy_mark_free(e4b, first >> 1, last >> 1);
>>   
>> -done:
>>   	mb_set_largest_free_order(sb, e4b->bd_info);
>>   	mb_update_avg_fragment_size(sb, e4b->bd_info);
>>   	mb_check_buddy(e4b);
>> -- 
>> 2.31.1
>>
diff mbox series

Patch

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a95fa6e2b0f9..2fbee0f0f5c3 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1892,11 +1892,6 @@  static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 	mb_check_buddy(e4b);
 	mb_free_blocks_double(inode, e4b, first, count);
 
-	this_cpu_inc(discard_pa_seq);
-	e4b->bd_info->bb_free += count;
-	if (first < e4b->bd_info->bb_first_free)
-		e4b->bd_info->bb_first_free = first;
-
 	/* access memory sequentially: check left neighbour,
 	 * clear range and then check right neighbour
 	 */
@@ -1922,9 +1917,14 @@  static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 				sb, e4b->bd_group,
 				EXT4_GROUP_INFO_BBITMAP_CORRUPT);
 		}
-		goto done;
+		return;
 	}
 
+	this_cpu_inc(discard_pa_seq);
+	e4b->bd_info->bb_free += count;
+	if (first < e4b->bd_info->bb_first_free)
+		e4b->bd_info->bb_first_free = first;
+
 	/* let's maintain fragments counter */
 	if (left_is_free && right_is_free)
 		e4b->bd_info->bb_fragments--;
@@ -1949,7 +1949,6 @@  static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 	if (first <= last)
 		mb_buddy_mark_free(e4b, first >> 1, last >> 1);
 
-done:
 	mb_set_largest_free_order(sb, e4b->bd_info);
 	mb_update_avg_fragment_size(sb, e4b->bd_info);
 	mb_check_buddy(e4b);