diff mbox series

[4/7] ext4: add positive int attr pointer to avoid sysfs variables overflow

Message ID 20240126085716.1363019-5-libaokun1@huawei.com
State Superseded
Headers show
Series ext4: avoid sysfs variables overflow causing BUG_ON/SOOB | expand

Commit Message

Baokun Li Jan. 26, 2024, 8:57 a.m. UTC
We can easily trigger a BUG_ON by using the following commands:

    mount /dev/$disk /tmp/test
    echo 2147483650 > /sys/fs/ext4/$disk/mb_group_prealloc
    echo test > /tmp/test/file && sync

==================================================================
kernel BUG at fs/ext4/mballoc.c:2029!
invalid opcode: 0000 [#1] PREEMPT SMP PTI
CPU: 3 PID: 320 Comm: kworker/u36:1 Not tainted 6.8.0-rc1 #462
RIP: 0010:mb_mark_used+0x358/0x370
[...]
Call Trace:
 ext4_mb_use_best_found+0x56/0x140
 ext4_mb_complex_scan_group+0x196/0x2f0
 ext4_mb_regular_allocator+0xa92/0xf00
 ext4_mb_new_blocks+0x302/0xbc0
 ext4_ext_map_blocks+0x95a/0xef0
 ext4_map_blocks+0x2b1/0x680
 ext4_do_writepages+0x733/0xbd0
[...]
==================================================================

In ext4_mb_normalize_group_request():
    ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;

Here fe_len is of type int, but s_mb_group_prealloc is of type unsigned
int, so setting s_mb_group_prealloc to 2147483650 overflows fe_len to a
negative number, which ultimately triggers a BUG_ON() in mb_mark_used().

Therefore, we add attr_pointer_pi (aka positive int attr pointer) with a
value range of 0-INT_MAX to avoid the above problem. In addition to the
mb_group_prealloc sysfs interface, the following interfaces also have uint
to int conversions that result in overflows, and are also fixed.

  err_ratelimit_burst
  msg_ratelimit_burst
  warning_ratelimit_burst
  err_ratelimit_interval_ms
  msg_ratelimit_interval_ms
  warning_ratelimit_interval_ms
  mb_best_avail_max_trim_order

CC: stable@vger.kernel.org
Signed-off-by: Baokun Li <libaokun1@huawei.com>
---
 fs/ext4/sysfs.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

Comments

Zhang Yi Jan. 27, 2024, 2:07 a.m. UTC | #1
On 2024/1/26 16:57, Baokun Li wrote:
> We can easily trigger a BUG_ON by using the following commands:
> 
>     mount /dev/$disk /tmp/test
>     echo 2147483650 > /sys/fs/ext4/$disk/mb_group_prealloc
>     echo test > /tmp/test/file && sync
> 
> ==================================================================
> kernel BUG at fs/ext4/mballoc.c:2029!
> invalid opcode: 0000 [#1] PREEMPT SMP PTI
> CPU: 3 PID: 320 Comm: kworker/u36:1 Not tainted 6.8.0-rc1 #462
> RIP: 0010:mb_mark_used+0x358/0x370
> [...]
> Call Trace:
>  ext4_mb_use_best_found+0x56/0x140
>  ext4_mb_complex_scan_group+0x196/0x2f0
>  ext4_mb_regular_allocator+0xa92/0xf00
>  ext4_mb_new_blocks+0x302/0xbc0
>  ext4_ext_map_blocks+0x95a/0xef0
>  ext4_map_blocks+0x2b1/0x680
>  ext4_do_writepages+0x733/0xbd0
> [...]
> ==================================================================
> 
> In ext4_mb_normalize_group_request():
>     ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
> 
> Here fe_len is of type int, but s_mb_group_prealloc is of type unsigned
> int, so setting s_mb_group_prealloc to 2147483650 overflows fe_len to a
> negative number, which ultimately triggers a BUG_ON() in mb_mark_used().
> 
> Therefore, we add attr_pointer_pi (aka positive int attr pointer) with a
> value range of 0-INT_MAX to avoid the above problem. In addition to the
> mb_group_prealloc sysfs interface, the following interfaces also have uint
> to int conversions that result in overflows, and are also fixed.
> 
>   err_ratelimit_burst
>   msg_ratelimit_burst
>   warning_ratelimit_burst
>   err_ratelimit_interval_ms
>   msg_ratelimit_interval_ms
>   warning_ratelimit_interval_ms
>   mb_best_avail_max_trim_order
> 
> CC: stable@vger.kernel.org
> Signed-off-by: Baokun Li <libaokun1@huawei.com>

Looks good to me.

Reviewed-by: Zhang Yi <yi.zhang@huawei.com>

> ---
>  fs/ext4/sysfs.c | 25 +++++++++++++++++--------
>  1 file changed, 17 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
> index a5d657fa05cb..6f9f96e00f2f 100644
> --- a/fs/ext4/sysfs.c
> +++ b/fs/ext4/sysfs.c
> @@ -30,6 +30,7 @@ typedef enum {
>  	attr_first_error_time,
>  	attr_last_error_time,
>  	attr_feature,
> +	attr_pointer_pi,
>  	attr_pointer_ui,
>  	attr_pointer_ul,
>  	attr_pointer_u64,
> @@ -178,6 +179,9 @@ static struct ext4_attr ext4_attr_##_name = {			\
>  #define EXT4_RO_ATTR_ES_STRING(_name,_elname,_size)			\
>  	EXT4_ATTR_STRING(_name, 0444, _size, ext4_super_block, _elname)
>  
> +#define EXT4_RW_ATTR_SBI_PI(_name,_elname)      \
> +	EXT4_ATTR_OFFSET(_name, 0644, pointer_pi, ext4_sb_info, _elname)
> +
>  #define EXT4_RW_ATTR_SBI_UI(_name,_elname)	\
>  	EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname)
>  
> @@ -213,17 +217,17 @@ EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
>  EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
>  EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
>  EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
> -EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
> +EXT4_RW_ATTR_SBI_PI(mb_group_prealloc, s_mb_group_prealloc);
>  EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups);
>  EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
>  EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
> -EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
> -EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst);
> -EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
> -EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
> -EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
> -EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
> -EXT4_RW_ATTR_SBI_UI(mb_best_avail_max_trim_order, s_mb_best_avail_max_trim_order);
> +EXT4_RW_ATTR_SBI_PI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
> +EXT4_RW_ATTR_SBI_PI(err_ratelimit_burst, s_err_ratelimit_state.burst);
> +EXT4_RW_ATTR_SBI_PI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
> +EXT4_RW_ATTR_SBI_PI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
> +EXT4_RW_ATTR_SBI_PI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
> +EXT4_RW_ATTR_SBI_PI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
> +EXT4_RW_ATTR_SBI_PI(mb_best_avail_max_trim_order, s_mb_best_avail_max_trim_order);
>  #ifdef CONFIG_EXT4_DEBUG
>  EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail);
>  #endif
> @@ -376,6 +380,7 @@ static ssize_t ext4_generic_attr_show(struct ext4_attr *a,
>  
>  	switch (a->attr_id) {
>  	case attr_inode_readahead:
> +	case attr_pointer_pi:
>  	case attr_pointer_ui:
>  		if (a->attr_ptr == ptr_ext4_super_block_offset)
>  			return sysfs_emit(buf, "%u\n", le32_to_cpup(ptr));
> @@ -448,6 +453,10 @@ static ssize_t ext4_generic_attr_store(struct ext4_attr *a,
>  		return ret;
>  
>  	switch (a->attr_id) {
> +	case attr_pointer_pi:
> +		if ((int)t < 0)
> +			return -EINVAL;
> +		fallthrough;
>  	case attr_pointer_ui:
>  		if (t != (unsigned int)t)
>  			return -EINVAL;
>
Jan Kara Feb. 13, 2024, 4:58 p.m. UTC | #2
On Fri 26-01-24 16:57:13, Baokun Li wrote:
> We can easily trigger a BUG_ON by using the following commands:
> 
>     mount /dev/$disk /tmp/test
>     echo 2147483650 > /sys/fs/ext4/$disk/mb_group_prealloc
>     echo test > /tmp/test/file && sync
> 
> ==================================================================
> kernel BUG at fs/ext4/mballoc.c:2029!
> invalid opcode: 0000 [#1] PREEMPT SMP PTI
> CPU: 3 PID: 320 Comm: kworker/u36:1 Not tainted 6.8.0-rc1 #462
> RIP: 0010:mb_mark_used+0x358/0x370
> [...]
> Call Trace:
>  ext4_mb_use_best_found+0x56/0x140
>  ext4_mb_complex_scan_group+0x196/0x2f0
>  ext4_mb_regular_allocator+0xa92/0xf00
>  ext4_mb_new_blocks+0x302/0xbc0
>  ext4_ext_map_blocks+0x95a/0xef0
>  ext4_map_blocks+0x2b1/0x680
>  ext4_do_writepages+0x733/0xbd0
> [...]
> ==================================================================
> 
> In ext4_mb_normalize_group_request():
>     ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
> 
> Here fe_len is of type int, but s_mb_group_prealloc is of type unsigned
> int, so setting s_mb_group_prealloc to 2147483650 overflows fe_len to a
> negative number, which ultimately triggers a BUG_ON() in mb_mark_used().
> 
> Therefore, we add attr_pointer_pi (aka positive int attr pointer) with a
> value range of 0-INT_MAX to avoid the above problem. In addition to the
> mb_group_prealloc sysfs interface, the following interfaces also have uint
> to int conversions that result in overflows, and are also fixed.
> 
>   err_ratelimit_burst
>   msg_ratelimit_burst
>   warning_ratelimit_burst
>   err_ratelimit_interval_ms
>   msg_ratelimit_interval_ms
>   warning_ratelimit_interval_ms
>   mb_best_avail_max_trim_order
> 
> CC: stable@vger.kernel.org
> Signed-off-by: Baokun Li <libaokun1@huawei.com>

I don't think you need to change s_mb_group_prealloc here and then restrict
it even further in the next patch. I'd just leave it alone here.

Also I think that limiting mb_best_avail_max_trim_order to 64 instead of
INT_MAX will make us more resilient to surprises in the future :) But I
don't really insist.

								Honza

> ---
>  fs/ext4/sysfs.c | 25 +++++++++++++++++--------
>  1 file changed, 17 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
> index a5d657fa05cb..6f9f96e00f2f 100644
> --- a/fs/ext4/sysfs.c
> +++ b/fs/ext4/sysfs.c
> @@ -30,6 +30,7 @@ typedef enum {
>  	attr_first_error_time,
>  	attr_last_error_time,
>  	attr_feature,
> +	attr_pointer_pi,
>  	attr_pointer_ui,
>  	attr_pointer_ul,
>  	attr_pointer_u64,
> @@ -178,6 +179,9 @@ static struct ext4_attr ext4_attr_##_name = {			\
>  #define EXT4_RO_ATTR_ES_STRING(_name,_elname,_size)			\
>  	EXT4_ATTR_STRING(_name, 0444, _size, ext4_super_block, _elname)
>  
> +#define EXT4_RW_ATTR_SBI_PI(_name,_elname)      \
> +	EXT4_ATTR_OFFSET(_name, 0644, pointer_pi, ext4_sb_info, _elname)
> +
>  #define EXT4_RW_ATTR_SBI_UI(_name,_elname)	\
>  	EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname)
>  
> @@ -213,17 +217,17 @@ EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
>  EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
>  EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
>  EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
> -EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
> +EXT4_RW_ATTR_SBI_PI(mb_group_prealloc, s_mb_group_prealloc);
>  EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups);
>  EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
>  EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
> -EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
> -EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst);
> -EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
> -EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
> -EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
> -EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
> -EXT4_RW_ATTR_SBI_UI(mb_best_avail_max_trim_order, s_mb_best_avail_max_trim_order);
> +EXT4_RW_ATTR_SBI_PI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
> +EXT4_RW_ATTR_SBI_PI(err_ratelimit_burst, s_err_ratelimit_state.burst);
> +EXT4_RW_ATTR_SBI_PI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
> +EXT4_RW_ATTR_SBI_PI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
> +EXT4_RW_ATTR_SBI_PI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
> +EXT4_RW_ATTR_SBI_PI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
> +EXT4_RW_ATTR_SBI_PI(mb_best_avail_max_trim_order, s_mb_best_avail_max_trim_order);
>  #ifdef CONFIG_EXT4_DEBUG
>  EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail);
>  #endif
> @@ -376,6 +380,7 @@ static ssize_t ext4_generic_attr_show(struct ext4_attr *a,
>  
>  	switch (a->attr_id) {
>  	case attr_inode_readahead:
> +	case attr_pointer_pi:
>  	case attr_pointer_ui:
>  		if (a->attr_ptr == ptr_ext4_super_block_offset)
>  			return sysfs_emit(buf, "%u\n", le32_to_cpup(ptr));
> @@ -448,6 +453,10 @@ static ssize_t ext4_generic_attr_store(struct ext4_attr *a,
>  		return ret;
>  
>  	switch (a->attr_id) {
> +	case attr_pointer_pi:
> +		if ((int)t < 0)
> +			return -EINVAL;
> +		fallthrough;
>  	case attr_pointer_ui:
>  		if (t != (unsigned int)t)
>  			return -EINVAL;
> -- 
> 2.31.1
>
Baokun Li Feb. 17, 2024, 7:41 a.m. UTC | #3
On 2024/2/14 0:58, Jan Kara wrote:
> On Fri 26-01-24 16:57:13, Baokun Li wrote:
>> We can easily trigger a BUG_ON by using the following commands:
>>
>>      mount /dev/$disk /tmp/test
>>      echo 2147483650 > /sys/fs/ext4/$disk/mb_group_prealloc
>>      echo test > /tmp/test/file && sync
>>
>> ==================================================================
>> kernel BUG at fs/ext4/mballoc.c:2029!
>> invalid opcode: 0000 [#1] PREEMPT SMP PTI
>> CPU: 3 PID: 320 Comm: kworker/u36:1 Not tainted 6.8.0-rc1 #462
>> RIP: 0010:mb_mark_used+0x358/0x370
>> [...]
>> Call Trace:
>>   ext4_mb_use_best_found+0x56/0x140
>>   ext4_mb_complex_scan_group+0x196/0x2f0
>>   ext4_mb_regular_allocator+0xa92/0xf00
>>   ext4_mb_new_blocks+0x302/0xbc0
>>   ext4_ext_map_blocks+0x95a/0xef0
>>   ext4_map_blocks+0x2b1/0x680
>>   ext4_do_writepages+0x733/0xbd0
>> [...]
>> ==================================================================
>>
>> In ext4_mb_normalize_group_request():
>>      ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
>>
>> Here fe_len is of type int, but s_mb_group_prealloc is of type unsigned
>> int, so setting s_mb_group_prealloc to 2147483650 overflows fe_len to a
>> negative number, which ultimately triggers a BUG_ON() in mb_mark_used().
>>
>> Therefore, we add attr_pointer_pi (aka positive int attr pointer) with a
>> value range of 0-INT_MAX to avoid the above problem. In addition to the
>> mb_group_prealloc sysfs interface, the following interfaces also have uint
>> to int conversions that result in overflows, and are also fixed.
>>
>>    err_ratelimit_burst
>>    msg_ratelimit_burst
>>    warning_ratelimit_burst
>>    err_ratelimit_interval_ms
>>    msg_ratelimit_interval_ms
>>    warning_ratelimit_interval_ms
>>    mb_best_avail_max_trim_order
>>
>> CC: stable@vger.kernel.org
>> Signed-off-by: Baokun Li <libaokun1@huawei.com>
> I don't think you need to change s_mb_group_prealloc here and then restrict
> it even further in the next patch. I'd just leave it alone here.
Yes, we could put the next patch before this one, but using
s_mb_group_prealloc as an example makes it easier to understand
why the attr_pointer_pi case is added here.There are several other
variables that don't have more convincing examples.
>
> Also I think that limiting mb_best_avail_max_trim_order to 64 instead of
> INT_MAX will make us more resilient to surprises in the future :) But I
> don't really insist.
>
> 								Honza
I think it's enough here to make sure that mb_best_avail_max_trim_order
is a positive number, since we always make sure that min_order
is not less than 0, as follows:

          order = fls(ac->ac_g_ex.fe_len) - 1;
          min_order = order - sbi->s_mb_best_avail_max_trim_order;
          if (min_order < 0)
                  min_order = 0;

An oversized mb_best_avail_max_trim_order can be interpreted as
always being CR_ANY_FREE. 😄
>> ---
>>   fs/ext4/sysfs.c | 25 +++++++++++++++++--------
>>   1 file changed, 17 insertions(+), 8 deletions(-)
>>
>> diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
>> index a5d657fa05cb..6f9f96e00f2f 100644
>> --- a/fs/ext4/sysfs.c
>> +++ b/fs/ext4/sysfs.c
>> @@ -30,6 +30,7 @@ typedef enum {
>>   	attr_first_error_time,
>>   	attr_last_error_time,
>>   	attr_feature,
>> +	attr_pointer_pi,
>>   	attr_pointer_ui,
>>   	attr_pointer_ul,
>>   	attr_pointer_u64,
>> @@ -178,6 +179,9 @@ static struct ext4_attr ext4_attr_##_name = {			\
>>   #define EXT4_RO_ATTR_ES_STRING(_name,_elname,_size)			\
>>   	EXT4_ATTR_STRING(_name, 0444, _size, ext4_super_block, _elname)
>>   
>> +#define EXT4_RW_ATTR_SBI_PI(_name,_elname)      \
>> +	EXT4_ATTR_OFFSET(_name, 0644, pointer_pi, ext4_sb_info, _elname)
>> +
>>   #define EXT4_RW_ATTR_SBI_UI(_name,_elname)	\
>>   	EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname)
>>   
>> @@ -213,17 +217,17 @@ EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
>>   EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
>>   EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
>>   EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
>> -EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
>> +EXT4_RW_ATTR_SBI_PI(mb_group_prealloc, s_mb_group_prealloc);
>>   EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups);
>>   EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
>>   EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
>> -EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
>> -EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst);
>> -EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
>> -EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
>> -EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
>> -EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
>> -EXT4_RW_ATTR_SBI_UI(mb_best_avail_max_trim_order, s_mb_best_avail_max_trim_order);
>> +EXT4_RW_ATTR_SBI_PI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
>> +EXT4_RW_ATTR_SBI_PI(err_ratelimit_burst, s_err_ratelimit_state.burst);
>> +EXT4_RW_ATTR_SBI_PI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
>> +EXT4_RW_ATTR_SBI_PI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
>> +EXT4_RW_ATTR_SBI_PI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
>> +EXT4_RW_ATTR_SBI_PI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
>> +EXT4_RW_ATTR_SBI_PI(mb_best_avail_max_trim_order, s_mb_best_avail_max_trim_order);
>>   #ifdef CONFIG_EXT4_DEBUG
>>   EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail);
>>   #endif
>> @@ -376,6 +380,7 @@ static ssize_t ext4_generic_attr_show(struct ext4_attr *a,
>>   
>>   	switch (a->attr_id) {
>>   	case attr_inode_readahead:
>> +	case attr_pointer_pi:
>>   	case attr_pointer_ui:
>>   		if (a->attr_ptr == ptr_ext4_super_block_offset)
>>   			return sysfs_emit(buf, "%u\n", le32_to_cpup(ptr));
>> @@ -448,6 +453,10 @@ static ssize_t ext4_generic_attr_store(struct ext4_attr *a,
>>   		return ret;
>>   
>>   	switch (a->attr_id) {
>> +	case attr_pointer_pi:
>> +		if ((int)t < 0)
>> +			return -EINVAL;
>> +		fallthrough;
>>   	case attr_pointer_ui:
>>   		if (t != (unsigned int)t)
>>   			return -EINVAL;
>> -- 
>> 2.31.1
>>


Thanks!
Jan Kara Feb. 23, 2024, 12:05 p.m. UTC | #4
On Sat 17-02-24 15:41:43, Baokun Li wrote:
> On 2024/2/14 0:58, Jan Kara wrote:
> > On Fri 26-01-24 16:57:13, Baokun Li wrote:
> > > We can easily trigger a BUG_ON by using the following commands:
> > > 
> > >      mount /dev/$disk /tmp/test
> > >      echo 2147483650 > /sys/fs/ext4/$disk/mb_group_prealloc
> > >      echo test > /tmp/test/file && sync
> > > 
> > > ==================================================================
> > > kernel BUG at fs/ext4/mballoc.c:2029!
> > > invalid opcode: 0000 [#1] PREEMPT SMP PTI
> > > CPU: 3 PID: 320 Comm: kworker/u36:1 Not tainted 6.8.0-rc1 #462
> > > RIP: 0010:mb_mark_used+0x358/0x370
> > > [...]
> > > Call Trace:
> > >   ext4_mb_use_best_found+0x56/0x140
> > >   ext4_mb_complex_scan_group+0x196/0x2f0
> > >   ext4_mb_regular_allocator+0xa92/0xf00
> > >   ext4_mb_new_blocks+0x302/0xbc0
> > >   ext4_ext_map_blocks+0x95a/0xef0
> > >   ext4_map_blocks+0x2b1/0x680
> > >   ext4_do_writepages+0x733/0xbd0
> > > [...]
> > > ==================================================================
> > > 
> > > In ext4_mb_normalize_group_request():
> > >      ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
> > > 
> > > Here fe_len is of type int, but s_mb_group_prealloc is of type unsigned
> > > int, so setting s_mb_group_prealloc to 2147483650 overflows fe_len to a
> > > negative number, which ultimately triggers a BUG_ON() in mb_mark_used().
> > > 
> > > Therefore, we add attr_pointer_pi (aka positive int attr pointer) with a
> > > value range of 0-INT_MAX to avoid the above problem. In addition to the
> > > mb_group_prealloc sysfs interface, the following interfaces also have uint
> > > to int conversions that result in overflows, and are also fixed.
> > > 
> > >    err_ratelimit_burst
> > >    msg_ratelimit_burst
> > >    warning_ratelimit_burst
> > >    err_ratelimit_interval_ms
> > >    msg_ratelimit_interval_ms
> > >    warning_ratelimit_interval_ms
> > >    mb_best_avail_max_trim_order
> > > 
> > > CC: stable@vger.kernel.org
> > > Signed-off-by: Baokun Li <libaokun1@huawei.com>
> > I don't think you need to change s_mb_group_prealloc here and then restrict
> > it even further in the next patch. I'd just leave it alone here.
> Yes, we could put the next patch before this one, but using
> s_mb_group_prealloc as an example makes it easier to understand
> why the attr_pointer_pi case is added here.There are several other
> variables that don't have more convincing examples.

Yes, I think reordering would be good. Because I've read the convertion and
started wondering: "is this enough?"

> > Also I think that limiting mb_best_avail_max_trim_order to 64 instead of
> > INT_MAX will make us more resilient to surprises in the future :) But I
> > don't really insist.
> > 
> > 								Honza
> I think it's enough here to make sure that mb_best_avail_max_trim_order
> is a positive number, since we always make sure that min_order
> is not less than 0, as follows:
> 
>          order = fls(ac->ac_g_ex.fe_len) - 1;
>          min_order = order - sbi->s_mb_best_avail_max_trim_order;
>          if (min_order < 0)
>                  min_order = 0;
> 
> An oversized mb_best_avail_max_trim_order can be interpreted as
> always being CR_ANY_FREE. 😄

Well, s_mb_best_avail_max_trim_order is not about allocation passes but
about how many times are we willing to shorten the goal extent to half and
still use the advanced free blocks search. And I agree that the mballoc
code is careful enough that large numbers don't matter there but still why
allowing storing garbage values? It is nicer to tell sysadmin he did
something wrong right away.

								Honza
Baokun Li Feb. 24, 2024, 2:46 a.m. UTC | #5
On 2024/2/23 20:05, Jan Kara wrote:
> On Sat 17-02-24 15:41:43, Baokun Li wrote:
>> On 2024/2/14 0:58, Jan Kara wrote:
>>> On Fri 26-01-24 16:57:13, Baokun Li wrote:
>>>> We can easily trigger a BUG_ON by using the following commands:
>>>>
>>>>       mount /dev/$disk /tmp/test
>>>>       echo 2147483650 > /sys/fs/ext4/$disk/mb_group_prealloc
>>>>       echo test > /tmp/test/file && sync
>>>>
>>>> ==================================================================
>>>> kernel BUG at fs/ext4/mballoc.c:2029!
>>>> invalid opcode: 0000 [#1] PREEMPT SMP PTI
>>>> CPU: 3 PID: 320 Comm: kworker/u36:1 Not tainted 6.8.0-rc1 #462
>>>> RIP: 0010:mb_mark_used+0x358/0x370
>>>> [...]
>>>> Call Trace:
>>>>    ext4_mb_use_best_found+0x56/0x140
>>>>    ext4_mb_complex_scan_group+0x196/0x2f0
>>>>    ext4_mb_regular_allocator+0xa92/0xf00
>>>>    ext4_mb_new_blocks+0x302/0xbc0
>>>>    ext4_ext_map_blocks+0x95a/0xef0
>>>>    ext4_map_blocks+0x2b1/0x680
>>>>    ext4_do_writepages+0x733/0xbd0
>>>> [...]
>>>> ==================================================================
>>>>
>>>> In ext4_mb_normalize_group_request():
>>>>       ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
>>>>
>>>> Here fe_len is of type int, but s_mb_group_prealloc is of type unsigned
>>>> int, so setting s_mb_group_prealloc to 2147483650 overflows fe_len to a
>>>> negative number, which ultimately triggers a BUG_ON() in mb_mark_used().
>>>>
>>>> Therefore, we add attr_pointer_pi (aka positive int attr pointer) with a
>>>> value range of 0-INT_MAX to avoid the above problem. In addition to the
>>>> mb_group_prealloc sysfs interface, the following interfaces also have uint
>>>> to int conversions that result in overflows, and are also fixed.
>>>>
>>>>     err_ratelimit_burst
>>>>     msg_ratelimit_burst
>>>>     warning_ratelimit_burst
>>>>     err_ratelimit_interval_ms
>>>>     msg_ratelimit_interval_ms
>>>>     warning_ratelimit_interval_ms
>>>>     mb_best_avail_max_trim_order
>>>>
>>>> CC: stable@vger.kernel.org
>>>> Signed-off-by: Baokun Li <libaokun1@huawei.com>
>>> I don't think you need to change s_mb_group_prealloc here and then restrict
>>> it even further in the next patch. I'd just leave it alone here.
>> Yes, we could put the next patch before this one, but using
>> s_mb_group_prealloc as an example makes it easier to understand
>> why the attr_pointer_pi case is added here.There are several other
>> variables that don't have more convincing examples.
> Yes, I think reordering would be good. Because I've read the convertion and
> started wondering: "is this enough?"
Well, I will put the next patch before this one in the next version.
>>> Also I think that limiting mb_best_avail_max_trim_order to 64 instead of
>>> INT_MAX will make us more resilient to surprises in the future :) But I
>>> don't really insist.
>>>
>>> 								Honza
>> I think it's enough here to make sure that mb_best_avail_max_trim_order
>> is a positive number, since we always make sure that min_order
>> is not less than 0, as follows:
>>
>>           order = fls(ac->ac_g_ex.fe_len) - 1;
>>           min_order = order - sbi->s_mb_best_avail_max_trim_order;
>>           if (min_order < 0)
>>                   min_order = 0;
>>
>> An oversized mb_best_avail_max_trim_order can be interpreted as
>> always being CR_ANY_FREE. 😄
> Well, s_mb_best_avail_max_trim_order is not about allocation passes but
> about how many times are we willing to shorten the goal extent to half and
> still use the advanced free blocks search.
Yes, this means that in CR1.5, in case the original request is satisfied,
we allow allocation of blocks with an order of (goal_extent_order -
s_mb_best_avail_max_trim_order) to accelerate block allocation.
> And I agree that the mballoc
> code is careful enough that large numbers don't matter there but still why
> allowing storing garbage values? It is nicer to tell sysadmin he did
> something wrong right away.
>
> 								Honza
Yes, we shouldn't allow storing rubbish values, otherwise it may
mislead admins, I will add an extra type to check it.


Thanks!
diff mbox series

Patch

diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index a5d657fa05cb..6f9f96e00f2f 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -30,6 +30,7 @@  typedef enum {
 	attr_first_error_time,
 	attr_last_error_time,
 	attr_feature,
+	attr_pointer_pi,
 	attr_pointer_ui,
 	attr_pointer_ul,
 	attr_pointer_u64,
@@ -178,6 +179,9 @@  static struct ext4_attr ext4_attr_##_name = {			\
 #define EXT4_RO_ATTR_ES_STRING(_name,_elname,_size)			\
 	EXT4_ATTR_STRING(_name, 0444, _size, ext4_super_block, _elname)
 
+#define EXT4_RW_ATTR_SBI_PI(_name,_elname)      \
+	EXT4_ATTR_OFFSET(_name, 0644, pointer_pi, ext4_sb_info, _elname)
+
 #define EXT4_RW_ATTR_SBI_UI(_name,_elname)	\
 	EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname)
 
@@ -213,17 +217,17 @@  EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
 EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
 EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
-EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+EXT4_RW_ATTR_SBI_PI(mb_group_prealloc, s_mb_group_prealloc);
 EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups);
 EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
 EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
-EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
-EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst);
-EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
-EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
-EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
-EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
-EXT4_RW_ATTR_SBI_UI(mb_best_avail_max_trim_order, s_mb_best_avail_max_trim_order);
+EXT4_RW_ATTR_SBI_PI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_PI(err_ratelimit_burst, s_err_ratelimit_state.burst);
+EXT4_RW_ATTR_SBI_PI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_PI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
+EXT4_RW_ATTR_SBI_PI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_PI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
+EXT4_RW_ATTR_SBI_PI(mb_best_avail_max_trim_order, s_mb_best_avail_max_trim_order);
 #ifdef CONFIG_EXT4_DEBUG
 EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail);
 #endif
@@ -376,6 +380,7 @@  static ssize_t ext4_generic_attr_show(struct ext4_attr *a,
 
 	switch (a->attr_id) {
 	case attr_inode_readahead:
+	case attr_pointer_pi:
 	case attr_pointer_ui:
 		if (a->attr_ptr == ptr_ext4_super_block_offset)
 			return sysfs_emit(buf, "%u\n", le32_to_cpup(ptr));
@@ -448,6 +453,10 @@  static ssize_t ext4_generic_attr_store(struct ext4_attr *a,
 		return ret;
 
 	switch (a->attr_id) {
+	case attr_pointer_pi:
+		if ((int)t < 0)
+			return -EINVAL;
+		fallthrough;
 	case attr_pointer_ui:
 		if (t != (unsigned int)t)
 			return -EINVAL;