[v3,4/9] ext4: fix slab-out-of-bounds in ext4_mb_find_good_group_avg_frag_lists()

Message ID 20240314140906.3064072-5-libaokun1@huawei.com
State Superseded
Series ext4: avoid sysfs variables overflow causing BUG_ON/SOOB

Commit Message

Baokun Li March 14, 2024, 2:09 p.m. UTC
We can trigger a slab-out-of-bounds with the following commands:

    mkfs.ext4 -F /dev/$disk 10G
    mount /dev/$disk /tmp/test
    echo 2147483647 > /sys/fs/ext4/$disk/mb_group_prealloc
    echo test > /tmp/test/file && sync

==================================================================
BUG: KASAN: slab-out-of-bounds in ext4_mb_find_good_group_avg_frag_lists+0x8a/0x200 [ext4]
Read of size 8 at addr ffff888121b9d0f0 by task kworker/u2:0/11
CPU: 0 PID: 11 Comm: kworker/u2:0 Tainted: GL 6.7.0-next-20240118 #521
Call Trace:
 dump_stack_lvl+0x2c/0x50
 kasan_report+0xb6/0xf0
 ext4_mb_find_good_group_avg_frag_lists+0x8a/0x200 [ext4]
 ext4_mb_regular_allocator+0x19e9/0x2370 [ext4]
 ext4_mb_new_blocks+0x88a/0x1370 [ext4]
 ext4_ext_map_blocks+0x14f7/0x2390 [ext4]
 ext4_map_blocks+0x569/0xea0 [ext4]
 ext4_do_writepages+0x10f6/0x1bc0 [ext4]
[...]
==================================================================

The sequence of calls that triggers the issue is as follows:

// Set s_mb_group_prealloc to 2147483647 via sysfs
ext4_mb_new_blocks
  ext4_mb_normalize_request
    ext4_mb_normalize_group_request
      ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc
  ext4_mb_regular_allocator
    ext4_mb_choose_next_group
      ext4_mb_choose_next_group_best_avail
        mb_avg_fragment_size_order
          order = fls(len) - 2 = 29
        ext4_mb_find_good_group_avg_frag_lists
          frag_list = &sbi->s_mb_avg_fragment_size[order]
          if (list_empty(frag_list)) // Trigger SOOB!

At a 4k block size the s_mb_avg_fragment_size array has only 14
elements, but with the oversized s_mb_group_prealloc set above, an
attempt to access the element at index 29 triggers the
slab-out-of-bounds read.
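For reference, MB_NUM_ORDERS(sb) is sb->s_blocksize_bits + 2, i.e. 14 at
a 4k block size. A minimal userspace sketch of the same arithmetic (the
fls() below stands in for the kernel helper) shows how far out of bounds
the index goes:

    #include <stdio.h>

    /* userspace stand-in for the kernel's fls() */
    static int fls(unsigned int x)
    {
        return x ? 32 - __builtin_clz(x) : 0;
    }

    int main(void)
    {
        int blocksize_bits = 12;                /* 4k block size */
        int num_orders = blocksize_bits + 2;    /* MB_NUM_ORDERS(sb) == 14 */
        int order = fls(2147483647) - 2;        /* oversized prealloc: 31 - 2 = 29 */

        printf("list length %d, index %d -> %s\n", num_orders, order,
               order >= num_orders ? "out of bounds" : "in bounds");
        return 0;
    }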

To fix the issue, add a new attr_id attr_clusters_in_group that limits
values to the range [0, sbi->s_clusters_per_group] and declare
mb_group_prealloc as that type. In addition, avoid returning an order
from mb_avg_fragment_size_order() greater than MB_NUM_ORDERS(sb), which
also avoids some useless loops.
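With this patch applied, re-running the reproducer's sysfs write is
expected to fail at the new bounds check instead (illustrative; the
exact shell error text may vary):

    echo 2147483647 > /sys/fs/ext4/$disk/mb_group_prealloc
    # write error: Invalid argument  (t > sbi->s_clusters_per_group)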

Fixes: 7e170922f06b ("ext4: Add allocation criteria 1.5 (CR1_5)")
CC: stable@vger.kernel.org
Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/mballoc.c |  4 ++++
 fs/ext4/sysfs.c   | 13 ++++++++++++-
 2 files changed, 16 insertions(+), 1 deletion(-)

Comments

Ojaswin Mujoo March 18, 2024, 12:39 p.m. UTC | #1
On Thu, Mar 14, 2024 at 10:09:01PM +0800, Baokun Li wrote:
> [...]
> 
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 12b3f196010b..48afe5aa228c 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -831,6 +831,8 @@ static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len)
>     return 0;
>   if (order == MB_NUM_ORDERS(sb))
>     order--;
> + if (WARN_ON_ONCE(order > MB_NUM_ORDERS(sb)))
> +   order = MB_NUM_ORDERS(sb) - 1;

Hey Baokun,

Thanks for fixing this. This patch looks good to me, feel free to add:

Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>

My comments after this are less about the patch and more about some
thoughts on the working of the average fragment lists.

So going through v2 and this patch got me thinking about what really
is going to happen when a user tries to allocate 32768 blocks, which is
also the maximum value we could have in, say, ac->ac_g_ex.fe_len.

When this happens, ext4_mb_regular_allocator() will directly set the
criteria as CR_GOAL_LEN_FAST. Now, we'll follow:

ext4_mb_choose_next_group_goal_fast()
  for (i=mb_avg_fragment_size_order(); i < MB_NUM_ORDERS; i++) { .. }

Here, mb_avg_fragment_size_order() will do something like:

  order = fls(32768) - 2 = 14
  ...
  if (order == MB_NUM_ORDERS(sb))
    order--;

  return order;

And we'll look in fragment list[13], and since none of the groups
there would have 32768 blocks free (since we don't track that here),
we'll unnecessarily traverse the full list before falling to
CR_BEST_AVAIL_LEN (which becomes a noop due to the way order and
min_order are calculated) and eventually to CR_GOAL_LEN_SLOW, where
we might get something or end up splitting.

I think something more optimal would be to:

1. Add another entry to the average fragment lists for completely empty
groups. (As a side note, I think we should use something like
MB_NUM_FRAG_ORDER instead of MB_NUM_ORDERS when calculating limits
related to the average fragment lists, since MB_NUM_ORDERS seems to be
the buddy max order, i.e. 8192 blocks, which is only valid for
CR_POWER2 and shouldn't really limit the fragment size lists.)

2. If we don't want to go with 1 (maybe there's some history for that),
then we probably should exit early from CR_GOAL_LEN_FAST so that we
don't iterate there; a rough sketch follows below.
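A rough, untested sketch of option 2 (the helper name and exact hook
point are hypothetical, not something from this patch):

    /*
     * Illustrative only: CR_GOAL_LEN_FAST walks the avg fragment lists,
     * which cannot contain a goal as large as a fully free group, so
     * bail out to the next criteria instead of scanning the whole list.
     */
    static bool ext4_mb_goal_fast_can_succeed(struct ext4_allocation_context *ac)
    {
            struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);

            return ac->ac_g_ex.fe_len < (ext4_grpblk_t)sbi->s_clusters_per_group;
    }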

I'd like to hear your thoughts on it, Baokun, Jan.

Regards,
ojaswin

>   return order;
>  }
> [...]
Jan Kara March 18, 2024, 3:25 p.m. UTC | #2
On Mon 18-03-24 18:09:18, Ojaswin Mujoo wrote:
> On Thu, Mar 14, 2024 at 10:09:01PM +0800, Baokun Li wrote:
> > [...]
> 
> Hey Baokun,
> 
> Thanks for fixing this. This patch looks good to me, feel free to add:
> 
> Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
> 
> my comments after this are less about the patch and more about some
> thoughts on the working of average fragment lists.
> 
> So going through the v2 and this patch got me thinking about what really
> is going to happen when a user tries to allocate 32768 blocks which is also 
> the maximum value we could have in say ac->ac_g_ex.fe_len.
> 
> When this happens, ext4_mb_regular_allocator() will directly set the
> criteria as CR_GOAL_LEN_FAST. Now, we'll follow:
> 
> ext4_mb_choose_next_group_goal_fast()
>   for (i=mb_avg_fragment_size_order(); i < MB_NUM_ORDERS; i++) { .. }
> 
> Here, mb_avg_fragment_size_order() will do something like:
> 
>   order = fls(32768) - 2 = 14
>   ...
>   if (order == MB_NUM_ORDERS(sb))
>     order--;
> 
>   return order;
> 
> And we'll look in the fragment list[13] and since none of the groups
> there would have 32768 blocks free (since we don't track it here) we'll
> unnecessarily traverse the full list before falling to CR_BEST_AVAIL_LEN
> (this will become a noop due to the way order and min_order
> are calculated) and eventually to CR_GOAL_LEN_SLOW where we might get
> something or end up splitting.

Yeah, agreed this looks a bit suboptimal. I'm just not 100% sure whether
we'll ever generate a request to allocate 32768 blocks - that would need
verification with tracing - because I have some vague recollection I once
arrived at the conclusion that this is not possible.

> I think something more optimal would be to:
> 
> 1. Add another entry to the average fragment lists for completely empty
> groups. (As a side note, I think we should use something like
> MB_NUM_FRAG_ORDER instead of MB_NUM_ORDERS when calculating limits
> related to the average fragment lists, since MB_NUM_ORDERS seems to be
> the buddy max order, i.e. 8192 blocks, which is only valid for
> CR_POWER2 and shouldn't really limit the fragment size lists.)

I guess the thinking was that you can never get a chunk larger than
1 << (MB_NUM_ORDERS - 1) (8192 blocks, i.e. 32M at 4k) from mballoc, so
there's no point in keeping fragment lists for such chunks?

								Honza
Baokun Li March 19, 2024, 10:05 a.m. UTC | #3
On 2024/3/18 20:39, Ojaswin Mujoo wrote:
> On Thu, Mar 14, 2024 at 10:09:01PM +0800, Baokun Li wrote:
>> --- a/fs/ext4/mballoc.c
>> +++ b/fs/ext4/mballoc.c
>> @@ -831,6 +831,8 @@ static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len)
>>      return 0;
>>    if (order == MB_NUM_ORDERS(sb))
>>      order--;
>> + if (WARN_ON_ONCE(order > MB_NUM_ORDERS(sb)))
>> +   order = MB_NUM_ORDERS(sb) - 1;
> Hey Baokun,
Hi Ojaswin,
>
> Thanks for fixing this. This patch looks good to me, feel free to add:
>
> Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Thanks for the review!
> my comments after this are less about the patch and more about some
> thoughts on the working of average fragment lists.
>
> So going through the v2 and this patch got me thinking about what really
> is going to happen when a user tries to allocate 32768 blocks which is also
> the maximum value we could have in say ac->ac_g_ex.fe_len.
>
> When this happens, ext4_mb_regular_allocator() will directly set the
> criteria as CR_GOAL_LEN_FAST. Now, we'll follow:
>
> ext4_mb_choose_next_group_goal_fast()
>    for (i=mb_avg_fragment_size_order(); i < MB_NUM_ORDERS; i++) { .. }
>
> Here, mb_avg_fragment_size_order() will do something like:
>
>    order = fls(32768) - 2 = 14
>    ...
>    if (order == MB_NUM_ORDERS(sb))
>      order--;
>
>    return order;
>
> And we'll look in the fragment list[13] and since none of the groups
> there would have 32768 blocks free (since we don't track it here) we'll
> unnecessarily traverse the full list before falling to CR_BEST_AVAIL_LEN
> (this will become a noop due to the way order and min_order
> are calculated) and eventually to CR_GOAL_LEN_SLOW where we might get
> something or end up splitting.
That's not quite right: in ext4_mb_choose_next_group_goal_fast(), even
though we're looking at the order-13 list, the group with 32768 free
blocks is also in there. So after passing ext4_mb_good_group() in
ext4_mb_find_good_group_avg_frag_lists(), we get a group with 32768
free blocks. And in ext4_mb_choose_next_group_best_avail() we were
supposed to allocate blocks quickly by trim order, so it's necessary
here too. So there are no unnecessary loops here.

But this will trigger the freshly added WARN_ON_ONCE, so in the next
iteration I need to change it to:

if (WARN_ON_ONCE(order > MB_NUM_ORDERS(ac->ac_sb) + 1))
        order = MB_NUM_ORDERS(ac->ac_sb) - 1;

(For fe_len = 32768 on a 4k filesystem, order = fls(32768) - 1 = 15 =
MB_NUM_ORDERS(ac->ac_sb) + 1, so that legitimate case no longer warns.)

In addition, when the block size is 4k, there are these limitations:

1) Limit the maximum size of the data allocation estimate to 8M in
     ext4_mb_normalize_request().
2) #define MAX_WRITEPAGES_EXTENT_LEN 2048
3) #define DIO_MAX_BLOCKS 4096
4) Metadata is generally not allocated in many blocks at a time

So it seems that only group_prealloc will allocate more than 2048
blocks (8M at 4k) at a time.

And I've tried removing those 8M/2048/4096 limits before, but the
performance of DIO writes barely changed; it doesn't look like the
number of blocks allocated at a time is the performance bottleneck
at the moment.

Thanks,
Baokun
Ojaswin Mujoo March 19, 2024, 10:36 a.m. UTC | #4
On Mon, Mar 18, 2024 at 04:25:09PM +0100, Jan Kara wrote:
> On Mon 18-03-24 18:09:18, Ojaswin Mujoo wrote:
> > On Thu, Mar 14, 2024 at 10:09:01PM +0800, Baokun Li wrote:
> > > [...]
> > 
> > [...]
> 
> Yeah, agreed this looks a bit suboptimal. I'm just not 100% sure whether
> we'll ever generate a request to allocate 32768 blocks - that would need
> verification with tracing - because I have some vague recollection I once
> arrived at the conclusion that this is not possible.

Ahh, right! I see the following line in mpage_add_bh_to_extent():

	/* Don't go larger than mballoc is willing to allocate */
	if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
		return false;

Where MAX_WRITEPAGES_EXTENT_LEN is 2048, i.e. 8MB on a 4k filesystem. As pointed out
by your comment there, it seems to come from the fact that we have some restrictions
in ext4_mb_normalize_request() which don't allow us to go beyond an 8MB length. I think
I was also looking at that code some time back and it really needs some rework; I'll
try to test a few things out.

So yep, in the usual paths we shouldn't be sending a request as big as 32768 blocks,
but it's still possible with group preallocation when s_mb_group_prealloc is set to 32768.

Anyway, thanks for pointing this out. I'll try to look into the code path more to see
how we can optimize it better, and maybe whether we can lift the 2048-block restriction.

Regards,
ojaswin

Ojaswin Mujoo March 19, 2024, 6:25 p.m. UTC | #5
On Tue, Mar 19, 2024 at 06:05:53PM +0800, Baokun Li wrote:
> On 2024/3/18 20:39, Ojaswin Mujoo wrote:
> > [...]
> That's not quite right, in ext4_mb_choose_next_group_goal_fast() even
> though we're looking for the group with order 13, the group with 32768
> free blocks is also in there. So after passing ext4_mb_good_group() in
> ext4_mb_find_good_group_avg_frag_lists(), we get a group with 32768
> free blocks. And in ext4_mb_choose_next_group_best_avail() we were

Hey Baokun,

So IIUC, a BG with 32768 blocks free will have bb_fragments = 0, and in
mb_update_avg_fragment_size() we exit early if a BG has bb_fragments = 0,
hence it won't show up in the order-13 list.

> supposed to allocate blocks quickly by trim order, so it's necessary
> here too. So there are no unnecessary loops here.
> 
> But this will trigger the freshly added WARN_ON_ONCE, so in the
> new iteration I need to change it to:
> 
> if (WARN_ON_ONCE(order > MB_NUM_ORDERS(ac->ac_sb) + 1))
>         order = MB_NUM_ORDERS(ac->ac_sb) - 1;
> 
> In addition, when the block size is 4k, there are these limitations:
> 
> 1) Limit the maximum size of the data allocation estimate to 8M in
>     ext4_mb_normalize_request().
> 2) #define MAX_WRITEPAGES_EXTENT_LEN 2048
> 3) #define DIO_MAX_BLOCKS 4096
> 4) Metadata is generally not allocated in many blocks at a time
> 
> So it seems that only group_prealloc will allocate more than 2048
> blocks at a time.
> 
> And I've tried removing those 8M/2048/4096 limits before, but the
> performance of DIO write barely changed, and it doesn't look like
> the performance bottleneck is here in the number of blocks allocated
> at a time at the moment.

Ohh that's interesting, on paper I think it does seem like it should
improve the performance. I think if CR_GOAL_LEN_FAST can start including
block groups which are completely empty, and lift those restrictions, then
we might see better performance. I'll try to play around a bit with this
as well.

Regards,
ojaswin

Baokun Li March 20, 2024, 1:30 a.m. UTC | #6
On 2024/3/20 2:25, Ojaswin Mujoo wrote:
> On Tue, Mar 19, 2024 at 06:05:53PM +0800, Baokun Li wrote:
>> [...]
>> That's not quite right, in ext4_mb_choose_next_group_goal_fast() even
>> though we're looking for the group with order 13, the group with 32768
>> free blocks is also in there. So after passing ext4_mb_good_group() in
>> ext4_mb_find_good_group_avg_frag_lists(), we get a group with 32768
>> free blocks. And in ext4_mb_choose_next_group_best_avail() we were
> Hey Baokun,
>
> So IIUC, a BG with 32768 blocks free will have bb_fragments = 0 and in
> mb_update_avg_fragment_size() we exit early if a BG has bb_fragments = 0
> hence it won't show up in the order 13 list.
Hello Ojaswin,

This sounded strange, so I added the following debugging information:

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c65fac9b8c72..c6ec423e2971 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1212,6 +1212,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
                         i = mb_find_next_zero_bit(bitmap, max, i);
         }
         grp->bb_fragments = fragments;
+       pr_err(">>> greoup: %u, bb_free: %d, bb_fragments: %d\n",
+              grp->bb_group, grp->bb_free, grp->bb_fragments);

         if (free != grp->bb_free) {
                 ext4_grp_locked_error(sb, group, 0, 0,


Then I mounted an ext4 image, waited for a moment, and got the
following printout:

 >>> greoup: 6, bb_free: 32768, bb_fragments: 1
 >>> greoup: 5, bb_free: 31741, bb_fragments: 1
 >>> greoup: 4, bb_free: 32768, bb_fragments: 1
 >>> greoup: 3, bb_free: 31741, bb_fragments: 1
 >>> greoup: 2, bb_free: 32768, bb_fragments: 1
 >>> greoup: 1, bb_free: 31741, bb_fragments: 1
 >>> greoup: 0, bb_free: 23511, bb_fragments: 1
>> [...]
>> And I've tried removing those 8M/2048/4096 limits before, but the
>> performance of DIO write barely changed, and it doesn't look like
>> the performance bottleneck is here in the number of blocks allocated
>> at a time at the moment.
> Ohh that's interesting, on paper I think it does seem like it should
> improve the performance. I think if CR_GOAL_LEN_FAST can start including
> block groups which are completely empty, and lift those restrictions then we
> might see better performance. I'll try to play around a bit with this as
> well.
>
> Regards,
> ojaswin
>
OK, waiting for your good news.
Ojaswin Mujoo March 20, 2024, 5:19 a.m. UTC | #7
On Wed, Mar 20, 2024 at 09:30:57AM +0800, Baokun Li wrote:
> On 2024/3/20 2:25, Ojaswin Mujoo wrote:
> > On Tue, Mar 19, 2024 at 06:05:53PM +0800, Baokun Li wrote:
> > > On 2024/3/18 20:39, Ojaswin Mujoo wrote:
> > > [...]
> > > That's not quite right, in ext4_mb_choose_next_group_goal_fast() even
> > > though we're looking for the group with order 13, the group with 32768
> > > free blocks is also in there. So after passing ext4_mb_good_group() in
> > > ext4_mb_find_good_group_avg_frag_lists(), we get a group with 32768
> > > free blocks. And in ext4_mb_choose_next_group_best_avail() we were
> > Hey Baokun,
> > 
> > So IIUC, a BG with 32768 blocks free will have bb_fragments = 0 and in
> > mb_update_avg_fragment_size() we exit early if a BG has bb_fragments = 0
> > hence it won't show up in the order 13 list.
> Hello Ojaswin,
> 
> This sounded strange, so I added the following debugging information:
> 
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index c65fac9b8c72..c6ec423e2971 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -1212,6 +1212,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
>                         i = mb_find_next_zero_bit(bitmap, max, i);
>         }
>         grp->bb_fragments = fragments;
> +       pr_err(">>> greoup: %u, bb_free: %d, bb_fragments: %d\n",
> +              grp->bb_group, grp->bb_free, grp->bb_fragments);
> 
>         if (free != grp->bb_free) {
>                 ext4_grp_locked_error(sb, group, 0, 0,
> 
> 
> Then I mounted an ext4 image, waited for a moment, and got the
> following printout:
> 
> >>> greoup: 6, bb_free: 32768, bb_fragments: 1
> >>> greoup: 5, bb_free: 31741, bb_fragments: 1
> >>> greoup: 4, bb_free: 32768, bb_fragments: 1
> >>> greoup: 3, bb_free: 31741, bb_fragments: 1
> >>> greoup: 2, bb_free: 32768, bb_fragments: 1
> >>> greoup: 1, bb_free: 31741, bb_fragments: 1
> >>> greoup: 0, bb_free: 23511, bb_fragments: 1

Ahh right, the fragments would be 1 and not zero, my bad! Thanks
for testing this.
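
So for a completely empty 4k group the arithmetic works out as follows
(using the values from your debug output above):

    avg fragment size = bb_free / bb_fragments = 32768 / 1 = 32768
    order = fls(32768) - 2 = 16 - 2 = 14
    order == MB_NUM_ORDERS(sb), so order-- gives 13

which is why such groups do land in s_mb_avg_fragment_size[13].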

> > > [...]
> > > And I've tried removing those 8M/2048/4096 limits before, but the
> > > performance of DIO write barely changed, and it doesn't look like
> > > the performance bottleneck is here in the number of blocks allocated
> > > at a time at the moment.
> > Ohh that's interesting, on paper I think it does seem like it should
> > improve the performance. I think if CR_GOAL_LEN_FAST can start including
> > block groups which are completely empty, and lift those restrictions then we
> > might see better performance. I'll try to play around a bit with this as
> > well.
> > 
> > Regards,
> > ojaswin
> > 
> OK, waiting for your good news.

:) 

Patch

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 12b3f196010b..48afe5aa228c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -831,6 +831,8 @@  static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len)
 		return 0;
 	if (order == MB_NUM_ORDERS(sb))
 		order--;
+	if (WARN_ON_ONCE(order > MB_NUM_ORDERS(sb)))
+		order = MB_NUM_ORDERS(sb) - 1;
 	return order;
 }
 
@@ -1008,6 +1010,8 @@  static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
 	 * goal length.
 	 */
 	order = fls(ac->ac_g_ex.fe_len) - 1;
+	if (WARN_ON_ONCE(order > MB_NUM_ORDERS(ac->ac_sb) - 1))
+		order = MB_NUM_ORDERS(ac->ac_sb) - 1;
 	min_order = order - sbi->s_mb_best_avail_max_trim_order;
 	if (min_order < 0)
 		min_order = 0;
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 7f455b5f22c0..ddd71673176c 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -29,6 +29,7 @@  typedef enum {
 	attr_trigger_test_error,
 	attr_first_error_time,
 	attr_last_error_time,
+	attr_clusters_in_group,
 	attr_feature,
 	attr_pointer_ui,
 	attr_pointer_ul,
@@ -207,13 +208,14 @@  EXT4_ATTR_FUNC(sra_exceeded_retry_limit, 0444);
 
 EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
 		 ext4_sb_info, s_inode_readahead_blks);
+EXT4_ATTR_OFFSET(mb_group_prealloc, 0644, clusters_in_group,
+		 ext4_sb_info, s_mb_group_prealloc);
 EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
 EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
 EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
 EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
 EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
-EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
 EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups);
 EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
 EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
@@ -376,6 +378,7 @@  static ssize_t ext4_generic_attr_show(struct ext4_attr *a,
 
 	switch (a->attr_id) {
 	case attr_inode_readahead:
+	case attr_clusters_in_group:
 	case attr_pointer_ui:
 		if (a->attr_ptr == ptr_ext4_super_block_offset)
 			return sysfs_emit(buf, "%u\n", le32_to_cpup(ptr));
@@ -455,6 +458,14 @@  static ssize_t ext4_generic_attr_store(struct ext4_attr *a,
 		else
 			*((unsigned int *) ptr) = t;
 		return len;
+	case attr_clusters_in_group:
+		ret = kstrtouint(skip_spaces(buf), 0, &t);
+		if (ret)
+			return ret;
+		if (t > sbi->s_clusters_per_group)
+			return -EINVAL;
+		*((unsigned int *) ptr) = t;
+		return len;
 	case attr_pointer_ul:
 		ret = kstrtoul(skip_spaces(buf), 0, &lt);
 		if (ret)