Message ID | 9ba7e5de79b8b25e335026d57ec0640fc25e5ce0.1534905460.git.jaco@uls.co.za |
---|---|
State | New, archived |
Headers | show |
Series | Add block_high_watermark sysfs tunable. | expand |
Hi, The below is based on a suggestion from Andreas. I opted for the sysfs tunable rather than an ioctl (which could also be used) because this enables setting the value without a special tool. Andreas mentioned a number of functions to adjust for allocating blocks, but only the two below seem to relate. I've also worked through all uses of ext4_blocks_count() and could not find further use cases that need adjustment. Some of the uses are during mount only (cluster reservations), and since this change is per-mount and not superblock-persistent, adjustments won't have any effect there. If this is desirable I'll try putting this in the superblock instead but this would require allocating feature bits and I'm not sure this change is worth a feature bit. I'll attempt limiting inode allocation in ialloc based on this next, just wanted to get feedback on the below first. My big question is this: how do I build a test case for this code? As an aside, the resize2fs from 64T to 56T eventually finished some time during the night. So just under 17 days total. If we can get an online resize for the same to be double or even triple that in total it'll still be a massive win for me. Even if we need to for the final stages take the filesystem offline for two days - I know a fsck on this system takes ~18 hours (will probably be a bit less now), so a shrink without having to move data blocks will take at least that time in all likelihood, a debugfs ncheck took ~12 hours on 64TB (which is needed to migrate inodes), icheck was ~12 minutes, most of the time spent opening the filesystem. Kind Regards, Jaco On 22/08/2018 04:21, Jaco Kroon wrote: > NOT READY FOR MERGE!!!!!! > > Limiting block allocations to a high watermark will eventually enable us > to perform online shrinks of an ext4 filesystem.
As an immediate > benefit it'll prevent allocation of blocks in the high range, which if > performed as a precursor to an offline filesystem shrink will help to > reduce the overall time a filesystem needs to be taken offline in order > to shrink it. > > (possible) shortcomings: > > Currently this tunable does not get stored to the superblock, and thus > needs to be set again after each mount. > > The ext4_statfs function doesn't adjust the f_bavail value currently, as > such df will report incorrect results. > > The inode allocator hasn't been synced yet. > --- > fs/ext4/balloc.c | 2 +- > fs/ext4/ext4.h | 10 ++++++++++ > fs/ext4/mballoc.c | 2 +- > fs/ext4/sysfs.c | 19 +++++++++++++++++++ > 4 files changed, 31 insertions(+), 2 deletions(-) > > diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c > index e5d6ee61ff48..4f723c7a9c88 100644 > --- a/fs/ext4/balloc.c > +++ b/fs/ext4/balloc.c > @@ -883,7 +883,7 @@ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode) > block_group++; > } > bg_start = ext4_group_first_block_no(inode->i_sb, block_group); > - last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; > + last_block = ext4_blocks_max_allocatable(EXT4_SB(inode->i_sb)) - 1; > > /* > * If we are doing delayed allocation, we don't need take > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h > index 0f0edd1cd0cd..dc30ea107c55 100644 > --- a/fs/ext4/ext4.h > +++ b/fs/ext4/ext4.h > @@ -1423,6 +1423,7 @@ struct ext4_sb_info { > unsigned int s_mb_order2_reqs; > unsigned int s_mb_group_prealloc; > unsigned int s_max_dir_size_kb; > + ext4_fsblk_t s_block_high_watermark; /* allocators must not allocate blocks above this */ > /* where last allocation was done - for stream allocation */ > unsigned long s_mb_last_group; > unsigned long s_mb_last_start; > @@ -2711,6 +2712,15 @@ static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) > le32_to_cpu(es->s_blocks_count_lo); > } > > +static inline ext4_fsblk_t ext4_blocks_max_allocatable(struct ext4_sb_info 
*sbi) > +{ > + ext4_fsblk_t blocks = ext4_blocks_count(sbi->s_es); > + if (sbi->s_block_high_watermark && sbi->s_block_high_watermark < blocks) > + return sbi->s_block_high_watermark; > + else > + return blocks; > +} > + > static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es) > { > return ((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) | > diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c > index e29fce2fbf25..a158c2c9de10 100644 > --- a/fs/ext4/mballoc.c > +++ b/fs/ext4/mballoc.c > @@ -4232,7 +4232,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, > /* start searching from the goal */ > goal = ar->goal; > if (goal < le32_to_cpu(es->s_first_data_block) || > - goal >= ext4_blocks_count(es)) > + goal >= ext4_blocks_max_allocatable(sbi)) > goal = le32_to_cpu(es->s_first_data_block); > ext4_get_group_no_and_offset(sb, goal, &group, &block); > > diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c > index 9212a026a1f1..2a1a955c2c0b 100644 > --- a/fs/ext4/sysfs.c > +++ b/fs/ext4/sysfs.c > @@ -30,6 +30,7 @@ typedef enum { > attr_feature, > attr_pointer_ui, > attr_pointer_atomic, > + attr_block_high_watermark, > } attr_id_t; > > typedef enum { > @@ -167,6 +168,7 @@ EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444); > EXT4_ATTR_FUNC(session_write_kbytes, 0444); > EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444); > EXT4_ATTR_FUNC(reserved_clusters, 0644); > +EXT4_ATTR_FUNC(block_high_watermark, 0600); > > EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead, > ext4_sb_info, s_inode_readahead_blks); > @@ -217,6 +219,7 @@ static struct attribute *ext4_attrs[] = { > ATTR_LIST(errors_count), > ATTR_LIST(first_error_time), > ATTR_LIST(last_error_time), > + ATTR_LIST(block_high_watermark), > NULL, > }; > > @@ -304,6 +307,9 @@ static ssize_t ext4_attr_show(struct kobject *kobj, > return print_tstamp(buf, sbi->s_es, s_first_error_time); > case attr_last_error_time: > return print_tstamp(buf, sbi->s_es, s_last_error_time); > + case 
attr_block_high_watermark: > + return snprintf(buf, PAGE_SIZE, "%llu\n", > + (s64) sbi->s_block_high_watermark); > } > > return 0; > @@ -318,6 +324,7 @@ static ssize_t ext4_attr_store(struct kobject *kobj, > struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); > void *ptr = calc_ptr(a, sbi); > unsigned long t; > + unsigned long long t2; > int ret; > > switch (a->attr_id) { > @@ -338,6 +345,18 @@ static ssize_t ext4_attr_store(struct kobject *kobj, > return inode_readahead_blks_store(sbi, buf, len); > case attr_trigger_test_error: > return trigger_test_error(sbi, buf, len); > + case attr_block_high_watermark: > + if (!ptr) > + return 0; > + ret = kstrtoull(skip_spaces(buf), 0, &t2); > + if (ret) > + return ret; > + if (t2 > ext4_blocks_count(sbi->s_es)) > + return -EINVAL; > + if (t2 && t2 < le32_to_cpu(sbi->s_es->s_first_data_block)) > + return -EINVAL; > + sbi->s_block_high_watermark = t2; > + return len; > } > return 0; > }
On Aug 21, 2018, at 8:57 PM, Jaco Kroon <jaco@uls.co.za> wrote: > > Hi, > > The below is based on suggestion from Andreas. I opted for the sysfs > tunable rather than an ioctl (which could also be used) because this > enables to set the value without a special tool. > > Andreas mentioned a number of functions to adjust for allocating blocks, > but only the two below seems to relate. I've also worked through all > uses of ext4_blocks_count() and could not find further use cases that > needs adjustment. Some of the uses is during mount only (cluster > reservations), and since this change is per-mount and nt superblock > persistent adjustments won't have any effect there. If this is > desirable I'll try putting this in the superblock instead but this would > require allocating feature bits and I'm not sure this change is worth a > feature bit. Haven't had a chance to look at the patch yet, but thanks for submitting. One comment below... > I'll attempt limiting inode allocation in ialloc based on this next, > just wanted to get feedback on the below first. > > My big question is this: how do I build a test case for this code? > > As an aside, the resize2fs from 64T to 56T eventually finished some time > during the night. So just under 17 days total. If we can get an online > resize for the same to be double or even triple that in total it'll > still be a massive win for me. Even if we need to for the final stages > take the filesystem offline for two days - I know a fsck on this system > takes ~18 hours (will probably be a bit less now), so a shrink without > having to move data blocks will take at least that time in all > likelihood, a debugfs ncheck took ~12 hours on 64TB (which is needed to > migrate inodes), icheck was ~12 minutes, most of the time spent to open > the filesystem. Note that if opening the filesystem takes a long time, you can use "debugfs -c" to skip loading the block and inode bitmaps, which can speed things up significantly. 
That doesn't work for everything (definitely not filesystem-modifying operations) but for many read-only operations it is very useful. Cheers, Andreas > On 22/08/2018 04:21, Jaco Kroon wrote: >> NOT READY FOR MERGE!!!!!! >> >> Limiting block allocations to a high watermark will eventually enable us >> to perform online shrinks of an ext4 filesystem. As an immediate >> benefit it'll prevent allocation of blocks in the high range, which if >> performed as a precursor to an offline filesystem shrink will help to >> reduce the overall time a filesystem needs to be taken offline in order >> to shrink it. >> >> (possible) shortcomings: >> >> Currently this tunable does not get stored to the superblock, and thus >> needs to be set again after each mount. >> >> The ext4_statfs function doesn't adjust the f_bavail value currently, as >> such df will report incorrect results. >> >> The inode allocator hasn't been synced yet. >> --- >> fs/ext4/balloc.c | 2 +- >> fs/ext4/ext4.h | 10 ++++++++++ >> fs/ext4/mballoc.c | 2 +- >> fs/ext4/sysfs.c | 19 +++++++++++++++++++ >> 4 files changed, 31 insertions(+), 2 deletions(-) >> >> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c >> index e5d6ee61ff48..4f723c7a9c88 100644 >> --- a/fs/ext4/balloc.c >> +++ b/fs/ext4/balloc.c >> @@ -883,7 +883,7 @@ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode) >> block_group++; >> } >> bg_start = ext4_group_first_block_no(inode->i_sb, block_group); >> - last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; >> + last_block = ext4_blocks_max_allocatable(EXT4_SB(inode->i_sb)) - 1; >> >> /* >> * If we are doing delayed allocation, we don't need take >> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h >> index 0f0edd1cd0cd..dc30ea107c55 100644 >> --- a/fs/ext4/ext4.h >> +++ b/fs/ext4/ext4.h >> @@ -1423,6 +1423,7 @@ struct ext4_sb_info { >> unsigned int s_mb_order2_reqs; >> unsigned int s_mb_group_prealloc; >> unsigned int s_max_dir_size_kb; >> + ext4_fsblk_t s_block_high_watermark; /* 
allocators must not allocate blocks above this */ >> /* where last allocation was done - for stream allocation */ >> unsigned long s_mb_last_group; >> unsigned long s_mb_last_start; >> @@ -2711,6 +2712,15 @@ static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) >> le32_to_cpu(es->s_blocks_count_lo); >> } >> >> +static inline ext4_fsblk_t ext4_blocks_max_allocatable(struct ext4_sb_info *sbi) >> +{ >> + ext4_fsblk_t blocks = ext4_blocks_count(sbi->s_es); >> + if (sbi->s_block_high_watermark && sbi->s_block_high_watermark < blocks) >> + return sbi->s_block_high_watermark; >> + else >> + return blocks; >> +} >> + >> static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es) >> { >> return ((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) | >> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c >> index e29fce2fbf25..a158c2c9de10 100644 >> --- a/fs/ext4/mballoc.c >> +++ b/fs/ext4/mballoc.c >> @@ -4232,7 +4232,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, >> /* start searching from the goal */ >> goal = ar->goal; >> if (goal < le32_to_cpu(es->s_first_data_block) || >> - goal >= ext4_blocks_count(es)) >> + goal >= ext4_blocks_max_allocatable(sbi)) >> goal = le32_to_cpu(es->s_first_data_block); >> ext4_get_group_no_and_offset(sb, goal, &group, &block); >> >> diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c >> index 9212a026a1f1..2a1a955c2c0b 100644 >> --- a/fs/ext4/sysfs.c >> +++ b/fs/ext4/sysfs.c >> @@ -30,6 +30,7 @@ typedef enum { >> attr_feature, >> attr_pointer_ui, >> attr_pointer_atomic, >> + attr_block_high_watermark, >> } attr_id_t; >> >> typedef enum { >> @@ -167,6 +168,7 @@ EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444); >> EXT4_ATTR_FUNC(session_write_kbytes, 0444); >> EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444); >> EXT4_ATTR_FUNC(reserved_clusters, 0644); >> +EXT4_ATTR_FUNC(block_high_watermark, 0600); >> >> EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead, >> ext4_sb_info, 
s_inode_readahead_blks); >> @@ -217,6 +219,7 @@ static struct attribute *ext4_attrs[] = { >> ATTR_LIST(errors_count), >> ATTR_LIST(first_error_time), >> ATTR_LIST(last_error_time), >> + ATTR_LIST(block_high_watermark), >> NULL, >> }; >> >> @@ -304,6 +307,9 @@ static ssize_t ext4_attr_show(struct kobject *kobj, >> return print_tstamp(buf, sbi->s_es, s_first_error_time); >> case attr_last_error_time: >> return print_tstamp(buf, sbi->s_es, s_last_error_time); >> + case attr_block_high_watermark: >> + return snprintf(buf, PAGE_SIZE, "%llu\n", >> + (s64) sbi->s_block_high_watermark); >> } >> >> return 0; >> @@ -318,6 +324,7 @@ static ssize_t ext4_attr_store(struct kobject *kobj, >> struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); >> void *ptr = calc_ptr(a, sbi); >> unsigned long t; >> + unsigned long long t2; >> int ret; >> >> switch (a->attr_id) { >> @@ -338,6 +345,18 @@ static ssize_t ext4_attr_store(struct kobject *kobj, >> return inode_readahead_blks_store(sbi, buf, len); >> case attr_trigger_test_error: >> return trigger_test_error(sbi, buf, len); >> + case attr_block_high_watermark: >> + if (!ptr) >> + return 0; >> + ret = kstrtoull(skip_spaces(buf), 0, &t2); >> + if (ret) >> + return ret; >> + if (t2 > ext4_blocks_count(sbi->s_es)) >> + return -EINVAL; >> + if (t2 && t2 < le32_to_cpu(sbi->s_es->s_first_data_block)) >> + return -EINVAL; >> + sbi->s_block_high_watermark = t2; >> + return len; >> } >> return 0; >> } > Cheers, Andreas
On Aug 21, 2018, at 8:21 PM, Jaco Kroon <jaco@uls.co.za> wrote: > > NOT READY FOR MERGE!!!!!! > > Limiting block allocations to a high watermark will eventually enable us > to perform online shrinks of an ext4 filesystem. As an immediate > benefit it'll prevent allocation of blocks in the high range, which if > performed as a precursor to an offline filesystem shrink will help to > reduce the overall time a filesystem needs to be taken offline in order > to shrink it. > > (possible) shortcomings: > > Currently this tunable does not get stored to the superblock, and thus > needs to be set again after each mount. > > The ext4_statfs function doesn't adjust the f_bavail value currently, as > such df will report incorrect results. > > The inode allocator hasn't been synced yet. Hi Jaco, sorry for the extreme delay in replying to this. It was lost in my inbox and I only just found it now. Looking through the patch, it does seem OK for the basic functionality intended, and would at least allow you to reduce the number of blocks allocated at the end of the device, meaning that the offline shrink would take less time (ideally none if all of the files are removed from the end of the device). With this first patch it should be possible to do an "online shrink" by setting the high watermark, then walking the filesystem checking for any files that have blocks beyond the HWM via "filefrag -v" and running e4defrag on those files. This should be largely transparent to userspace. The current patch would not allow directly limiting inode allocation, but the "inode_goal" tunable could be used to influence the inode selection to allow "mkdir + rsync + mv" to move directory trees to lower inodes. Only files currently open for write would not be safe to move to new inodes.
I think for fully using this functionality in the kernel/e2fsprogs a few more additions are needed, as you mentioned above: - store the high watermark in the superblock via tune2fs, so that it is not lost if the system is rebooted or filesystem remounted - fix ext4_statfs() to adjust available blocks appropriately - avoid allocating inodes in blocks above the high watermark Typically, using tune2fs to adjust a mounted filesystem should change the value used by the kernel, so also having a /sys tunable gets tricky. One option would be to leave "sbi->s_block_high_watermark = 0" and use the superblock value if sbi->s_block_high_watermark == 0, and only use sbi->s_block_high_watermark if it is set directly? Something like: static inline ext4_fsblk_t ext4_blocks_max_allocatable(struct ext4_sb_info *sbi) { ext4_fsblk_t blocks = ext4_blocks_count(sbi->s_es); if (unlikely(sbi->s_block_high_watermark && sbi->s_block_high_watermark < blocks)) return sbi->s_block_high_watermark; if (unlikely(sbi->s_es->s_blk_high_watermark && le64_to_cpu(sbi->s_es->s_blk_high_watermark) < blocks)) return le64_to_cpu(sbi->s_es->s_blk_high_watermark); return blocks; } This adds a bit more runtime overhead vs. setting s_block_high_watermark from the superblock at mount time, but is more flexible. For ext4_statfs() do we subtract only the free blocks beyond HWM from the available count, or all blocks? Subtracting the difference between ext4_blocks_count() and ext4_blocks_max_allocatable() is easy (zero if no high watermark), but the available blocks should not be negative if there are lots of blocks used beyond the HWM and few free below it. Better would be if the available blocks would report the free blocks below the HWM, but this would involve subtracting free blocks above the HWM and adjusting this as blocks above the limit are freed. For the inode allocation limit, it is fairly straightforward to map the block HWM to an inode HWM based on the group descriptor that the HWM is in.
For future use (dynamic inode tables) it may be desirable to also have a separately tunable inode HWM, but it could also be done later as needed. On the e2fsprogs side, there should be a "-E block_high_watermark=N" tunable added to set the field in the superblock, and support to print it in dumpe2fs and modify it in debugfs via "ssv". It may be useful to add a "-f" force flag to e4defrag so that it moves inodes even if they are not less fragmented afterward, so blocks beyond the HWM are always freed. Alternately, block and inode move (for closed files) might be implemented in userspace via resize2fs (essentially cp+rename) when it is doing an online shrink of the filesystem? That might be simpler from a user point of view instead of needing to run e4defrag manually, which needs to be scripted to find the files to be moved. Optionally, should there be a "hard" and a "soft" block limit? For example, if the high watermark is set to a negative value -blocks it is a soft limit (prefer lower allocation, but can exceed it if filesystem is full), or have a separate "soft" flag stored somewhere else? In the first case, we should mask off the high bit when accessing this field, and use it only for deciding if allocation can continue after a normal scan failed. In the longer term, the resize ioctl could be enhanced to drop the last group(s) if they are above the high watermark and have no used blocks/inodes. The resize2fs tool could report if trying to shrink a filesystem with in-use blocks that the HWM will be set and file migration is needed, then do the online migration (reporting any files that are open via lsof) and returning an error at the end indicating which processes are blocking the resize.
Some minor nits in the patch inline below: > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h > index 0f0edd1cd0cd..dc30ea107c55 100644 > --- a/fs/ext4/ext4.h > +++ b/fs/ext4/ext4.h > @@ -1423,6 +1423,7 @@ struct ext4_sb_info { > unsigned int s_mb_order2_reqs; > unsigned int s_mb_group_prealloc; > unsigned int s_max_dir_size_kb; > + ext4_fsblk_t s_block_high_watermark; /* allocators must not allocate blocks above this */ (style) should stay under 80 columns. Easiest to just shorten comment to something like "/* max allocatable block number */" or similar. > @@ -2711,6 +2712,15 @@ static inline ext4_fsblk_t ext4_blocks_count(struct +static inline ext4_fsblk_t ext4_blocks_max_allocatable(struct ext4_sb_info *sbi) > +{ > + ext4_fsblk_t blocks = ext4_blocks_count(sbi->s_es); (style) blank line after variable declarations > + if (sbi->s_block_high_watermark && sbi->s_block_high_watermark < blocks) > + return sbi->s_block_high_watermark; > + else > + return blocks; (style) no need for "else" after "return". > diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c > index 9212a026a1f1..2a1a955c2c0b 100644 > --- a/fs/ext4/sysfs.c > +++ b/fs/ext4/sysfs.c > @@ -304,6 +307,9 @@ static ssize_t ext4_attr_show(struct kobject *kobj, > return print_tstamp(buf, sbi->s_es, s_first_error_time); > case attr_last_error_time: > return print_tstamp(buf, sbi->s_es, s_last_error_time); > + case attr_block_high_watermark: > + return snprintf(buf, PAGE_SIZE, "%llu\n", > + (s64) sbi->s_block_high_watermark); (style) no space after typecast Cheers, Andreas
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index e5d6ee61ff48..4f723c7a9c88 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -883,7 +883,7 @@ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode) block_group++; } bg_start = ext4_group_first_block_no(inode->i_sb, block_group); - last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; + last_block = ext4_blocks_max_allocatable(EXT4_SB(inode->i_sb)) - 1; /* * If we are doing delayed allocation, we don't need take diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0f0edd1cd0cd..dc30ea107c55 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1423,6 +1423,7 @@ struct ext4_sb_info { unsigned int s_mb_order2_reqs; unsigned int s_mb_group_prealloc; unsigned int s_max_dir_size_kb; + ext4_fsblk_t s_block_high_watermark; /* allocators must not allocate blocks above this */ /* where last allocation was done - for stream allocation */ unsigned long s_mb_last_group; unsigned long s_mb_last_start; @@ -2711,6 +2712,15 @@ static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) le32_to_cpu(es->s_blocks_count_lo); } +static inline ext4_fsblk_t ext4_blocks_max_allocatable(struct ext4_sb_info *sbi) +{ + ext4_fsblk_t blocks = ext4_blocks_count(sbi->s_es); + if (sbi->s_block_high_watermark && sbi->s_block_high_watermark < blocks) + return sbi->s_block_high_watermark; + else + return blocks; +} + static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es) { return ((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) | diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e29fce2fbf25..a158c2c9de10 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4232,7 +4232,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, /* start searching from the goal */ goal = ar->goal; if (goal < le32_to_cpu(es->s_first_data_block) || - goal >= ext4_blocks_count(es)) + goal >= ext4_blocks_max_allocatable(sbi)) goal = le32_to_cpu(es->s_first_data_block); 
ext4_get_group_no_and_offset(sb, goal, &group, &block); diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 9212a026a1f1..2a1a955c2c0b 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -30,6 +30,7 @@ typedef enum { attr_feature, attr_pointer_ui, attr_pointer_atomic, + attr_block_high_watermark, } attr_id_t; typedef enum { @@ -167,6 +168,7 @@ EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444); EXT4_ATTR_FUNC(session_write_kbytes, 0444); EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444); EXT4_ATTR_FUNC(reserved_clusters, 0644); +EXT4_ATTR_FUNC(block_high_watermark, 0600); EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead, ext4_sb_info, s_inode_readahead_blks); @@ -217,6 +219,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(errors_count), ATTR_LIST(first_error_time), ATTR_LIST(last_error_time), + ATTR_LIST(block_high_watermark), NULL, }; @@ -304,6 +307,9 @@ static ssize_t ext4_attr_show(struct kobject *kobj, return print_tstamp(buf, sbi->s_es, s_first_error_time); case attr_last_error_time: return print_tstamp(buf, sbi->s_es, s_last_error_time); + case attr_block_high_watermark: + return snprintf(buf, PAGE_SIZE, "%llu\n", + (s64) sbi->s_block_high_watermark); } return 0; @@ -318,6 +324,7 @@ static ssize_t ext4_attr_store(struct kobject *kobj, struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); void *ptr = calc_ptr(a, sbi); unsigned long t; + unsigned long long t2; int ret; switch (a->attr_id) { @@ -338,6 +345,18 @@ static ssize_t ext4_attr_store(struct kobject *kobj, return inode_readahead_blks_store(sbi, buf, len); case attr_trigger_test_error: return trigger_test_error(sbi, buf, len); + case attr_block_high_watermark: + if (!ptr) + return 0; + ret = kstrtoull(skip_spaces(buf), 0, &t2); + if (ret) + return ret; + if (t2 > ext4_blocks_count(sbi->s_es)) + return -EINVAL; + if (t2 && t2 < le32_to_cpu(sbi->s_es->s_first_data_block)) + return -EINVAL; + sbi->s_block_high_watermark = t2; + return len; } return 0; }