diff mbox series

[v2] ext4: fix reserved space counter leakage

Message ID 20210823061358.84473-1-jefflexu@linux.alibaba.com
State Awaiting Upstream
Headers show
Series [v2] ext4: fix reserved space counter leakage | expand

Commit Message

Jingbo Xu Aug. 23, 2021, 6:13 a.m. UTC
When ext4_insert_delayed block receives and recovers from an error from
ext4_es_insert_delayed_block(), e.g., ENOMEM, it does not release the
space it has reserved for that block insertion as it should. One effect
of this bug is that s_dirtyclusters_counter is not decremented and
remains incorrectly elevated until the file system has been unmounted.
This can result in premature ENOSPC returns and apparent loss of free
space.

Another effect of this bug is that
/sys/fs/ext4/<dev>/delayed_allocation_blocks can remain non-zero even
after syncfs has been executed on the filesystem.

Besides, add check for s_dirtyclusters_counter when inode is going to be
evicted and freed. s_dirtyclusters_counter can still keep non-zero until
inode is written back in .evict_inode(), and thus the check is delayed
to .destroy_inode().

Fixes: 51865fda28e5 ("ext4: let ext4 maintain extent status tree")
Cc: <stable@vger.kernel.org>
Suggested-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Signed-off-by: Jeffle Xu <jefflexu@linux.alibaba.com>
---
changes since v1:
- improve commit log suggested by Eric Whitney
- update "Suggested-by" title for Gao Xian, who actually found this bug
  code
- add check for s_dirtyclusters_counter in .destroy_inode()
---
 fs/ext4/inode.c | 5 +++++
 fs/ext4/super.c | 6 ++++++
 2 files changed, 11 insertions(+)

Comments

Eric Whitney Aug. 23, 2021, 8:30 p.m. UTC | #1
* Jeffle Xu <jefflexu@linux.alibaba.com>:
> When ext4_insert_delayed block receives and recovers from an error from
> ext4_es_insert_delayed_block(), e.g., ENOMEM, it does not release the
> space it has reserved for that block insertion as it should. One effect
> of this bug is that s_dirtyclusters_counter is not decremented and
> remains incorrectly elevated until the file system has been unmounted.
> This can result in premature ENOSPC returns and apparent loss of free
> space.
> 
> Another effect of this bug is that
> /sys/fs/ext4/<dev>/delayed_allocation_blocks can remain non-zero even
> after syncfs has been executed on the filesystem.
> 
> Besides, add check for s_dirtyclusters_counter when inode is going to be
> evicted and freed. s_dirtyclusters_counter can still keep non-zero until
> inode is written back in .evict_inode(), and thus the check is delayed
> to .destroy_inode().
> 
> Fixes: 51865fda28e5 ("ext4: let ext4 maintain extent status tree")
> Cc: <stable@vger.kernel.org>
> Suggested-by: Gao Xiang <hsiangkao@linux.alibaba.com>
> Signed-off-by: Jeffle Xu <jefflexu@linux.alibaba.com>
> ---
> changes since v1:
> - improve commit log suggested by Eric Whitney
> - update "Suggested-by" title for Gao Xian, who actually found this bug
>   code
> - add check for s_dirtyclusters_counter in .destroy_inode()
> ---
>  fs/ext4/inode.c | 5 +++++
>  fs/ext4/super.c | 6 ++++++
>  2 files changed, 11 insertions(+)
> 
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index d8de607849df..73daf9443e5e 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1640,6 +1640,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
>  	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
>  	int ret;
>  	bool allocated = false;
> +	bool reserved = false;
>  
>  	/*
>  	 * If the cluster containing lblk is shared with a delayed,
> @@ -1656,6 +1657,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
>  		ret = ext4_da_reserve_space(inode);
>  		if (ret != 0)   /* ENOSPC */
>  			goto errout;
> +		reserved = true;
>  	} else {   /* bigalloc */
>  		if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
>  			if (!ext4_es_scan_clu(inode,
> @@ -1668,6 +1670,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
>  					ret = ext4_da_reserve_space(inode);
>  					if (ret != 0)   /* ENOSPC */
>  						goto errout;
> +					reserved = true;
>  				} else {
>  					allocated = true;
>  				}
> @@ -1678,6 +1681,8 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
>  	}
>  
>  	ret = ext4_es_insert_delayed_block(inode, lblk, allocated);
> +	if (ret && reserved)
> +		ext4_da_release_space(inode, 1);
>  
>  errout:
>  	return ret;
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index dfa09a277b56..61bf52b58fca 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -1351,6 +1351,12 @@ static void ext4_destroy_inode(struct inode *inode)
>  				true);
>  		dump_stack();
>  	}
> +
> +	if (EXT4_I(inode)->i_reserved_data_blocks)
> +		ext4_msg(inode->i_sb, KERN_ERR,
> +			 "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!",
> +			 inode->i_ino, EXT4_I(inode),
> +			 EXT4_I(inode)->i_reserved_data_blocks);
>  }
>  
>  static void init_once(void *foo)
> -- 
> 2.27.0
> 

Looks good, passed 4k xfstests-bld regression.  Feel free to add:

Reviewed-by: Eric Whitney <enwlinux@gmail.com>

Eric
Jingbo Xu Aug. 25, 2021, 1:38 a.m. UTC | #2
On 8/24/21 4:30 AM, Eric Whitney wrote:
> * Jeffle Xu <jefflexu@linux.alibaba.com>:
>> When ext4_insert_delayed block receives and recovers from an error from
>> ext4_es_insert_delayed_block(), e.g., ENOMEM, it does not release the
>> space it has reserved for that block insertion as it should. One effect
>> of this bug is that s_dirtyclusters_counter is not decremented and
>> remains incorrectly elevated until the file system has been unmounted.
>> This can result in premature ENOSPC returns and apparent loss of free
>> space.
>>
>> Another effect of this bug is that
>> /sys/fs/ext4/<dev>/delayed_allocation_blocks can remain non-zero even
>> after syncfs has been executed on the filesystem.
>>
>> Besides, add check for s_dirtyclusters_counter when inode is going to be
>> evicted and freed. s_dirtyclusters_counter can still keep non-zero until
>> inode is written back in .evict_inode(), and thus the check is delayed
>> to .destroy_inode().
>>
>> Fixes: 51865fda28e5 ("ext4: let ext4 maintain extent status tree")
>> Cc: <stable@vger.kernel.org>
>> Suggested-by: Gao Xiang <hsiangkao@linux.alibaba.com>
>> Signed-off-by: Jeffle Xu <jefflexu@linux.alibaba.com>
>> ---
>> changes since v1:
>> - improve commit log suggested by Eric Whitney
>> - update "Suggested-by" title for Gao Xian, who actually found this bug
>>   code
>> - add check for s_dirtyclusters_counter in .destroy_inode()
>> ---
>>  fs/ext4/inode.c | 5 +++++
>>  fs/ext4/super.c | 6 ++++++
>>  2 files changed, 11 insertions(+)
>>
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index d8de607849df..73daf9443e5e 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -1640,6 +1640,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
>>  	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
>>  	int ret;
>>  	bool allocated = false;
>> +	bool reserved = false;
>>  
>>  	/*
>>  	 * If the cluster containing lblk is shared with a delayed,
>> @@ -1656,6 +1657,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
>>  		ret = ext4_da_reserve_space(inode);
>>  		if (ret != 0)   /* ENOSPC */
>>  			goto errout;
>> +		reserved = true;
>>  	} else {   /* bigalloc */
>>  		if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
>>  			if (!ext4_es_scan_clu(inode,
>> @@ -1668,6 +1670,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
>>  					ret = ext4_da_reserve_space(inode);
>>  					if (ret != 0)   /* ENOSPC */
>>  						goto errout;
>> +					reserved = true;
>>  				} else {
>>  					allocated = true;
>>  				}
>> @@ -1678,6 +1681,8 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
>>  	}
>>  
>>  	ret = ext4_es_insert_delayed_block(inode, lblk, allocated);
>> +	if (ret && reserved)
>> +		ext4_da_release_space(inode, 1);
>>  
>>  errout:
>>  	return ret;
>> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
>> index dfa09a277b56..61bf52b58fca 100644
>> --- a/fs/ext4/super.c
>> +++ b/fs/ext4/super.c
>> @@ -1351,6 +1351,12 @@ static void ext4_destroy_inode(struct inode *inode)
>>  				true);
>>  		dump_stack();
>>  	}
>> +
>> +	if (EXT4_I(inode)->i_reserved_data_blocks)
>> +		ext4_msg(inode->i_sb, KERN_ERR,
>> +			 "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!",
>> +			 inode->i_ino, EXT4_I(inode),
>> +			 EXT4_I(inode)->i_reserved_data_blocks);
>>  }
>>  
>>  static void init_once(void *foo)
>> -- 
>> 2.27.0
>>
> 
> Looks good, passed 4k xfstests-bld regression.  Feel free to add:
> 
> Reviewed-by: Eric Whitney <enwlinux@gmail.com>


Hi tytso, it's a bug fix and it would be great if it could be merged to
5.15.
Jingbo Xu Sept. 6, 2021, 6:46 a.m. UTC | #3
On 8/25/21 9:38 AM, JeffleXu wrote:
> 
> 
> On 8/24/21 4:30 AM, Eric Whitney wrote:
>> * Jeffle Xu <jefflexu@linux.alibaba.com>:
>>> When ext4_insert_delayed block receives and recovers from an error from
>>> ext4_es_insert_delayed_block(), e.g., ENOMEM, it does not release the
>>> space it has reserved for that block insertion as it should. One effect
>>> of this bug is that s_dirtyclusters_counter is not decremented and
>>> remains incorrectly elevated until the file system has been unmounted.
>>> This can result in premature ENOSPC returns and apparent loss of free
>>> space.
>>>
>>> Another effect of this bug is that
>>> /sys/fs/ext4/<dev>/delayed_allocation_blocks can remain non-zero even
>>> after syncfs has been executed on the filesystem.
>>>
>>> Besides, add check for s_dirtyclusters_counter when inode is going to be
>>> evicted and freed. s_dirtyclusters_counter can still keep non-zero until
>>> inode is written back in .evict_inode(), and thus the check is delayed
>>> to .destroy_inode().
>>>
>>> Fixes: 51865fda28e5 ("ext4: let ext4 maintain extent status tree")
>>> Cc: <stable@vger.kernel.org>
>>> Suggested-by: Gao Xiang <hsiangkao@linux.alibaba.com>
>>> Signed-off-by: Jeffle Xu <jefflexu@linux.alibaba.com>
>>> ---
>>> changes since v1:
>>> - improve commit log suggested by Eric Whitney
>>> - update "Suggested-by" title for Gao Xian, who actually found this bug
>>>   code
>>> - add check for s_dirtyclusters_counter in .destroy_inode()
>>> ---
>>>  fs/ext4/inode.c | 5 +++++
>>>  fs/ext4/super.c | 6 ++++++
>>>  2 files changed, 11 insertions(+)
>>>
>>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>>> index d8de607849df..73daf9443e5e 100644
>>> --- a/fs/ext4/inode.c
>>> +++ b/fs/ext4/inode.c
>>> @@ -1640,6 +1640,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
>>>  	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
>>>  	int ret;
>>>  	bool allocated = false;
>>> +	bool reserved = false;
>>>  
>>>  	/*
>>>  	 * If the cluster containing lblk is shared with a delayed,
>>> @@ -1656,6 +1657,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
>>>  		ret = ext4_da_reserve_space(inode);
>>>  		if (ret != 0)   /* ENOSPC */
>>>  			goto errout;
>>> +		reserved = true;
>>>  	} else {   /* bigalloc */
>>>  		if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
>>>  			if (!ext4_es_scan_clu(inode,
>>> @@ -1668,6 +1670,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
>>>  					ret = ext4_da_reserve_space(inode);
>>>  					if (ret != 0)   /* ENOSPC */
>>>  						goto errout;
>>> +					reserved = true;
>>>  				} else {
>>>  					allocated = true;
>>>  				}
>>> @@ -1678,6 +1681,8 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
>>>  	}
>>>  
>>>  	ret = ext4_es_insert_delayed_block(inode, lblk, allocated);
>>> +	if (ret && reserved)
>>> +		ext4_da_release_space(inode, 1);
>>>  
>>>  errout:
>>>  	return ret;
>>> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
>>> index dfa09a277b56..61bf52b58fca 100644
>>> --- a/fs/ext4/super.c
>>> +++ b/fs/ext4/super.c
>>> @@ -1351,6 +1351,12 @@ static void ext4_destroy_inode(struct inode *inode)
>>>  				true);
>>>  		dump_stack();
>>>  	}
>>> +
>>> +	if (EXT4_I(inode)->i_reserved_data_blocks)
>>> +		ext4_msg(inode->i_sb, KERN_ERR,
>>> +			 "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!",
>>> +			 inode->i_ino, EXT4_I(inode),
>>> +			 EXT4_I(inode)->i_reserved_data_blocks);
>>>  }
>>>  
>>>  static void init_once(void *foo)
>>> -- 
>>> 2.27.0
>>>
>>
>> Looks good, passed 4k xfstests-bld regression.  Feel free to add:
>>
>> Reviewed-by: Eric Whitney <enwlinux@gmail.com>
> 
> 
> Hi tytso, it's a bug fix and it would be great if it could be merged to
> 5.15.
> 

ping ...
Gao Xiang Sept. 10, 2021, 5:48 a.m. UTC | #4
Hi Ted,

On Mon, Sep 06, 2021 at 02:46:16PM +0800, JeffleXu wrote:
> 
> 
> On 8/25/21 9:38 AM, JeffleXu wrote:
> > 
> > 
> > On 8/24/21 4:30 AM, Eric Whitney wrote:
> >> * Jeffle Xu <jefflexu@linux.alibaba.com>:
> >>> When ext4_insert_delayed block receives and recovers from an error from
> >>> ext4_es_insert_delayed_block(), e.g., ENOMEM, it does not release the
> >>> space it has reserved for that block insertion as it should. One effect
> >>> of this bug is that s_dirtyclusters_counter is not decremented and
> >>> remains incorrectly elevated until the file system has been unmounted.
> >>> This can result in premature ENOSPC returns and apparent loss of free
> >>> space.
> >>>
> >>> Another effect of this bug is that
> >>> /sys/fs/ext4/<dev>/delayed_allocation_blocks can remain non-zero even
> >>> after syncfs has been executed on the filesystem.
> >>>
> >>> Besides, add check for s_dirtyclusters_counter when inode is going to be
> >>> evicted and freed. s_dirtyclusters_counter can still keep non-zero until
> >>> inode is written back in .evict_inode(), and thus the check is delayed
> >>> to .destroy_inode().
> >>>
> >>> Fixes: 51865fda28e5 ("ext4: let ext4 maintain extent status tree")
> >>> Cc: <stable@vger.kernel.org>
> >>> Suggested-by: Gao Xiang <hsiangkao@linux.alibaba.com>
> >>> Signed-off-by: Jeffle Xu <jefflexu@linux.alibaba.com>
> >>> ---
> >>> changes since v1:
> >>> - improve commit log suggested by Eric Whitney
> >>> - update "Suggested-by" title for Gao Xian, who actually found this bug
> >>>   code
> >>> - add check for s_dirtyclusters_counter in .destroy_inode()
> >>> ---
> >>>  fs/ext4/inode.c | 5 +++++
> >>>  fs/ext4/super.c | 6 ++++++
> >>>  2 files changed, 11 insertions(+)
> >>>
> >>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> >>> index d8de607849df..73daf9443e5e 100644
> >>> --- a/fs/ext4/inode.c
> >>> +++ b/fs/ext4/inode.c
> >>> @@ -1640,6 +1640,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
> >>>  	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> >>>  	int ret;
> >>>  	bool allocated = false;
> >>> +	bool reserved = false;
> >>>  
> >>>  	/*
> >>>  	 * If the cluster containing lblk is shared with a delayed,
> >>> @@ -1656,6 +1657,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
> >>>  		ret = ext4_da_reserve_space(inode);
> >>>  		if (ret != 0)   /* ENOSPC */
> >>>  			goto errout;
> >>> +		reserved = true;
> >>>  	} else {   /* bigalloc */
> >>>  		if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
> >>>  			if (!ext4_es_scan_clu(inode,
> >>> @@ -1668,6 +1670,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
> >>>  					ret = ext4_da_reserve_space(inode);
> >>>  					if (ret != 0)   /* ENOSPC */
> >>>  						goto errout;
> >>> +					reserved = true;
> >>>  				} else {
> >>>  					allocated = true;
> >>>  				}
> >>> @@ -1678,6 +1681,8 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
> >>>  	}
> >>>  
> >>>  	ret = ext4_es_insert_delayed_block(inode, lblk, allocated);
> >>> +	if (ret && reserved)
> >>> +		ext4_da_release_space(inode, 1);
> >>>  
> >>>  errout:
> >>>  	return ret;
> >>> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> >>> index dfa09a277b56..61bf52b58fca 100644
> >>> --- a/fs/ext4/super.c
> >>> +++ b/fs/ext4/super.c
> >>> @@ -1351,6 +1351,12 @@ static void ext4_destroy_inode(struct inode *inode)
> >>>  				true);
> >>>  		dump_stack();
> >>>  	}
> >>> +
> >>> +	if (EXT4_I(inode)->i_reserved_data_blocks)
> >>> +		ext4_msg(inode->i_sb, KERN_ERR,
> >>> +			 "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!",
> >>> +			 inode->i_ino, EXT4_I(inode),
> >>> +			 EXT4_I(inode)->i_reserved_data_blocks);
> >>>  }
> >>>  
> >>>  static void init_once(void *foo)
> >>> -- 
> >>> 2.27.0
> >>>
> >>
> >> Looks good, passed 4k xfstests-bld regression.  Feel free to add:
> >>
> >> Reviewed-by: Eric Whitney <enwlinux@gmail.com>
> > 
> > 
> > Hi tytso, it's a bug fix and it would be great if it could be merged to
> > 5.15.
> > 
> 
> ping ...

It seems somewhat a real issue and has some impact in our production.
Would you mind giving some hints if it could be resolved in this way?

Or if some other concerns / solution about this?

Thanks,
Gao Xiang

> 
> -- 
> Thanks,
> Jeffle
Theodore Ts'o Sept. 16, 2021, 3:01 a.m. UTC | #5
On Mon, 23 Aug 2021 14:13:58 +0800, Jeffle Xu wrote:
> When ext4_insert_delayed block receives and recovers from an error from
> ext4_es_insert_delayed_block(), e.g., ENOMEM, it does not release the
> space it has reserved for that block insertion as it should. One effect
> of this bug is that s_dirtyclusters_counter is not decremented and
> remains incorrectly elevated until the file system has been unmounted.
> This can result in premature ENOSPC returns and apparent loss of free
> space.
> 
> [...]

Applied, thanks!

[1/1] ext4: fix reserved space counter leakage
      commit: 8ec5146a46609a89658e5dc9bc40b70ce70ef43b

Best regards,
diff mbox series

Patch

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d8de607849df..73daf9443e5e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1640,6 +1640,7 @@  static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	int ret;
 	bool allocated = false;
+	bool reserved = false;
 
 	/*
 	 * If the cluster containing lblk is shared with a delayed,
@@ -1656,6 +1657,7 @@  static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
 		ret = ext4_da_reserve_space(inode);
 		if (ret != 0)   /* ENOSPC */
 			goto errout;
+		reserved = true;
 	} else {   /* bigalloc */
 		if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
 			if (!ext4_es_scan_clu(inode,
@@ -1668,6 +1670,7 @@  static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
 					ret = ext4_da_reserve_space(inode);
 					if (ret != 0)   /* ENOSPC */
 						goto errout;
+					reserved = true;
 				} else {
 					allocated = true;
 				}
@@ -1678,6 +1681,8 @@  static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
 	}
 
 	ret = ext4_es_insert_delayed_block(inode, lblk, allocated);
+	if (ret && reserved)
+		ext4_da_release_space(inode, 1);
 
 errout:
 	return ret;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index dfa09a277b56..61bf52b58fca 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1351,6 +1351,12 @@  static void ext4_destroy_inode(struct inode *inode)
 				true);
 		dump_stack();
 	}
+
+	if (EXT4_I(inode)->i_reserved_data_blocks)
+		ext4_msg(inode->i_sb, KERN_ERR,
+			 "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!",
+			 inode->i_ino, EXT4_I(inode),
+			 EXT4_I(inode)->i_reserved_data_blocks);
 }
 
 static void init_once(void *foo)