Add block_high_watermark sysfs tunable.

Message ID 9ba7e5de79b8b25e335026d57ec0640fc25e5ce0.1534905460.git.jaco@uls.co.za
State New
Headers show
Series
  • Add block_high_watermark sysfs tunable.
Related show

Commit Message

Jaco Kroon Aug. 22, 2018, 2:21 a.m.
NOT READY FOR MERGE!!!!!!

Limiting block allocations to a high watermark will eventually enable us
to perform online shrinks of an ext4 filesystem.  As an immediate
benefit it'll prevent allocation of blocks in the high range, which if
performed as a precursor to an offline filesystem shrink will help to
reduce the overall time a filesystem needs to be taken offline in order
to shrink it.

(possible) shortcomings:

Currently this tunable does not get stored to the superblock, and thus
needs to be set again after each mount.

The ext4_statfs function doesn't adjust the f_bavail value currently, as
such df will report incorrect results.

The inode allocator hasn't been synced yet.
---
 fs/ext4/balloc.c  |  2 +-
 fs/ext4/ext4.h    | 10 ++++++++++
 fs/ext4/mballoc.c |  2 +-
 fs/ext4/sysfs.c   | 19 +++++++++++++++++++
 4 files changed, 31 insertions(+), 2 deletions(-)

Comments

Jaco Kroon Aug. 22, 2018, 2:57 a.m. | #1
Hi,

The below is based on suggestion from Andreas.  I opted for the sysfs
tunable rather than an ioctl (which could also be used) because this
enables to set the value without a special tool.

Andreas mentioned a number of functions to adjust for allocating blocks,
but only the two below seems to relate.  I've also worked through all
uses of ext4_blocks_count() and could not find further use cases that
needs adjustment.  Some of the uses is during mount only (cluster
reservations), and since this change is per-mount and nt superblock
persistent adjustments won't have any effect there.  If this is
desirable I'll try putting this in the superblock instead but this would
require allocating feature bits and I'm not sure this change is worth a
feature bit.

I'll attempt limiting inode allocation in ialloc based on this next,
just wanted to get feedback on the below first.

My big question is this:  how do I build a test case for this code?

As an aside, the resize2fs from 64T to 56T eventually finished some time
during the night.  So just under 17 days total.  If we can get an online
resize for the same to be double or even triple that in total it'll
still be a massive win for me.  Even if we need to for the final stages
take the filesystem offline for two days - I know a fsck on this system
takes ~18 hours (will probably be a bit less now), so a shrink without
having to move data blocks will take at least that time in all
likelihood, a debugfs ncheck took ~12 hours on 64TB (which is needed to
migrate inodes), icheck was ~12 minutes, most of the time spent to open
the filesystem.

Kind Regards,
Jaco

On 22/08/2018 04:21, Jaco Kroon wrote:
> NOT READY FOR MERGE!!!!!!
>
> Limiting block allocations to a high watermark will eventually enable us
> to perform online shrinks of an ext4 filesystem.  As an immediate
> benefit it'll prevent allocation of blocks in the high range, which if
> performed as a precursor to an offline filesystem shrink will help to
> reduce the overall time a filesystem needs to be taken offline in order
> to shrink it.
>
> (possible) shortcomings:
>
> Currently this tunable does not get stored to the superblock, and thus
> needs to be set again after each mount.
>
> The ext4_statfs function doesn't adjust the f_bavail value currently, as
> such df will report incorrect results.
>
> The inode allocator hasn't been synced yet.
> ---
>  fs/ext4/balloc.c  |  2 +-
>  fs/ext4/ext4.h    | 10 ++++++++++
>  fs/ext4/mballoc.c |  2 +-
>  fs/ext4/sysfs.c   | 19 +++++++++++++++++++
>  4 files changed, 31 insertions(+), 2 deletions(-)
>
> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
> index e5d6ee61ff48..4f723c7a9c88 100644
> --- a/fs/ext4/balloc.c
> +++ b/fs/ext4/balloc.c
> @@ -883,7 +883,7 @@ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode)
>  			block_group++;
>  	}
>  	bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
> -	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
> +	last_block = ext4_blocks_max_allocatable(EXT4_SB(inode->i_sb)) - 1;
>  
>  	/*
>  	 * If we are doing delayed allocation, we don't need take
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 0f0edd1cd0cd..dc30ea107c55 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1423,6 +1423,7 @@ struct ext4_sb_info {
>  	unsigned int s_mb_order2_reqs;
>  	unsigned int s_mb_group_prealloc;
>  	unsigned int s_max_dir_size_kb;
> +	ext4_fsblk_t s_block_high_watermark; /* allocators must not allocate blocks above this */
>  	/* where last allocation was done - for stream allocation */
>  	unsigned long s_mb_last_group;
>  	unsigned long s_mb_last_start;
> @@ -2711,6 +2712,15 @@ static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
>  		le32_to_cpu(es->s_blocks_count_lo);
>  }
>  
> +static inline ext4_fsblk_t ext4_blocks_max_allocatable(struct ext4_sb_info *sbi)
> +{
> +	ext4_fsblk_t blocks = ext4_blocks_count(sbi->s_es);
> +	if (sbi->s_block_high_watermark && sbi->s_block_high_watermark < blocks)
> +		return sbi->s_block_high_watermark;
> +	else
> +		return blocks;
> +}
> +
>  static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es)
>  {
>  	return ((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) |
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index e29fce2fbf25..a158c2c9de10 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -4232,7 +4232,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
>  	/* start searching from the goal */
>  	goal = ar->goal;
>  	if (goal < le32_to_cpu(es->s_first_data_block) ||
> -			goal >= ext4_blocks_count(es))
> +			goal >= ext4_blocks_max_allocatable(sbi))
>  		goal = le32_to_cpu(es->s_first_data_block);
>  	ext4_get_group_no_and_offset(sb, goal, &group, &block);
>  
> diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
> index 9212a026a1f1..2a1a955c2c0b 100644
> --- a/fs/ext4/sysfs.c
> +++ b/fs/ext4/sysfs.c
> @@ -30,6 +30,7 @@ typedef enum {
>  	attr_feature,
>  	attr_pointer_ui,
>  	attr_pointer_atomic,
> +	attr_block_high_watermark,
>  } attr_id_t;
>  
>  typedef enum {
> @@ -167,6 +168,7 @@ EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444);
>  EXT4_ATTR_FUNC(session_write_kbytes, 0444);
>  EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444);
>  EXT4_ATTR_FUNC(reserved_clusters, 0644);
> +EXT4_ATTR_FUNC(block_high_watermark, 0600);
>  
>  EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
>  		 ext4_sb_info, s_inode_readahead_blks);
> @@ -217,6 +219,7 @@ static struct attribute *ext4_attrs[] = {
>  	ATTR_LIST(errors_count),
>  	ATTR_LIST(first_error_time),
>  	ATTR_LIST(last_error_time),
> +	ATTR_LIST(block_high_watermark),
>  	NULL,
>  };
>  
> @@ -304,6 +307,9 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
>  		return print_tstamp(buf, sbi->s_es, s_first_error_time);
>  	case attr_last_error_time:
>  		return print_tstamp(buf, sbi->s_es, s_last_error_time);
> +	case attr_block_high_watermark:
> +		return snprintf(buf, PAGE_SIZE, "%llu\n",
> +				(s64) sbi->s_block_high_watermark);
>  	}
>  
>  	return 0;
> @@ -318,6 +324,7 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
>  	struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
>  	void *ptr = calc_ptr(a, sbi);
>  	unsigned long t;
> +	unsigned long long t2;
>  	int ret;
>  
>  	switch (a->attr_id) {
> @@ -338,6 +345,18 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
>  		return inode_readahead_blks_store(sbi, buf, len);
>  	case attr_trigger_test_error:
>  		return trigger_test_error(sbi, buf, len);
> +	case attr_block_high_watermark:
> +		if (!ptr)
> +			return 0;
> +		ret = kstrtoull(skip_spaces(buf), 0, &t2);
> +		if (ret)
> +			return ret;
> +		if (t2 > ext4_blocks_count(sbi->s_es))
> +			return -EINVAL;
> +		if (t2 && t2 < le32_to_cpu(sbi->s_es->s_first_data_block))
> +			return -EINVAL;
> +		sbi->s_block_high_watermark = t2;
> +		return len;
>  	}
>  	return 0;
>  }
Andreas Dilger Aug. 22, 2018, 5:02 p.m. | #2
On Aug 21, 2018, at 8:57 PM, Jaco Kroon <jaco@uls.co.za> wrote:
> 
> Hi,
> 
> The below is based on suggestion from Andreas.  I opted for the sysfs
> tunable rather than an ioctl (which could also be used) because this
> enables to set the value without a special tool.
> 
> Andreas mentioned a number of functions to adjust for allocating blocks,
> but only the two below seems to relate.  I've also worked through all
> uses of ext4_blocks_count() and could not find further use cases that
> needs adjustment.  Some of the uses is during mount only (cluster
> reservations), and since this change is per-mount and nt superblock
> persistent adjustments won't have any effect there.  If this is
> desirable I'll try putting this in the superblock instead but this would
> require allocating feature bits and I'm not sure this change is worth a
> feature bit.

Haven't had a chance to look at the patch yet, but thanks for submitting.
One comment below...

> I'll attempt limiting inode allocation in ialloc based on this next,
> just wanted to get feedback on the below first.
> 
> My big question is this:  how do I build a test case for this code?
> 
> As an aside, the resize2fs from 64T to 56T eventually finished some time
> during the night.  So just under 17 days total.  If we can get an online
> resize for the same to be double or even triple that in total it'll
> still be a massive win for me.  Even if we need to for the final stages
> take the filesystem offline for two days - I know a fsck on this system
> takes ~18 hours (will probably be a bit less now), so a shrink without
> having to move data blocks will take at least that time in all
> likelihood, a debugfs ncheck took ~12 hours on 64TB (which is needed to
> migrate inodes), icheck was ~12 minutes, most of the time spent to open
> the filesystem.

Note that if opening the filesystem takes a long time, you can use
"debugfs -c" to skip loading the block and inode bitmaps, which can
speed things up significantly.  That doesn't work for everything
(definitely not filesystem-modifying operations) but for many read-only
operations it is very useful.

Cheers, Andreas

> On 22/08/2018 04:21, Jaco Kroon wrote:
>> NOT READY FOR MERGE!!!!!!
>> 
>> Limiting block allocations to a high watermark will eventually enable us
>> to perform online shrinks of an ext4 filesystem.  As an immediate
>> benefit it'll prevent allocation of blocks in the high range, which if
>> performed as a precursor to an offline filesystem shrink will help to
>> reduce the overall time a filesystem needs to be taken offline in order
>> to shrink it.
>> 
>> (possible) shortcomings:
>> 
>> Currently this tunable does not get stored to the superblock, and thus
>> needs to be set again after each mount.
>> 
>> The ext4_statfs function doesn't adjust the f_bavail value currently, as
>> such df will report incorrect results.
>> 
>> The inode allocator hasn't been synced yet.
>> ---
>> fs/ext4/balloc.c  |  2 +-
>> fs/ext4/ext4.h    | 10 ++++++++++
>> fs/ext4/mballoc.c |  2 +-
>> fs/ext4/sysfs.c   | 19 +++++++++++++++++++
>> 4 files changed, 31 insertions(+), 2 deletions(-)
>> 
>> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
>> index e5d6ee61ff48..4f723c7a9c88 100644
>> --- a/fs/ext4/balloc.c
>> +++ b/fs/ext4/balloc.c
>> @@ -883,7 +883,7 @@ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode)
>> 			block_group++;
>> 	}
>> 	bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
>> -	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
>> +	last_block = ext4_blocks_max_allocatable(EXT4_SB(inode->i_sb)) - 1;
>> 
>> 	/*
>> 	 * If we are doing delayed allocation, we don't need take
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index 0f0edd1cd0cd..dc30ea107c55 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -1423,6 +1423,7 @@ struct ext4_sb_info {
>> 	unsigned int s_mb_order2_reqs;
>> 	unsigned int s_mb_group_prealloc;
>> 	unsigned int s_max_dir_size_kb;
>> +	ext4_fsblk_t s_block_high_watermark; /* allocators must not allocate blocks above this */
>> 	/* where last allocation was done - for stream allocation */
>> 	unsigned long s_mb_last_group;
>> 	unsigned long s_mb_last_start;
>> @@ -2711,6 +2712,15 @@ static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
>> 		le32_to_cpu(es->s_blocks_count_lo);
>> }
>> 
>> +static inline ext4_fsblk_t ext4_blocks_max_allocatable(struct ext4_sb_info *sbi)
>> +{
>> +	ext4_fsblk_t blocks = ext4_blocks_count(sbi->s_es);
>> +	if (sbi->s_block_high_watermark && sbi->s_block_high_watermark < blocks)
>> +		return sbi->s_block_high_watermark;
>> +	else
>> +		return blocks;
>> +}
>> +
>> static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es)
>> {
>> 	return ((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) |
>> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
>> index e29fce2fbf25..a158c2c9de10 100644
>> --- a/fs/ext4/mballoc.c
>> +++ b/fs/ext4/mballoc.c
>> @@ -4232,7 +4232,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
>> 	/* start searching from the goal */
>> 	goal = ar->goal;
>> 	if (goal < le32_to_cpu(es->s_first_data_block) ||
>> -			goal >= ext4_blocks_count(es))
>> +			goal >= ext4_blocks_max_allocatable(sbi))
>> 		goal = le32_to_cpu(es->s_first_data_block);
>> 	ext4_get_group_no_and_offset(sb, goal, &group, &block);
>> 
>> diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
>> index 9212a026a1f1..2a1a955c2c0b 100644
>> --- a/fs/ext4/sysfs.c
>> +++ b/fs/ext4/sysfs.c
>> @@ -30,6 +30,7 @@ typedef enum {
>> 	attr_feature,
>> 	attr_pointer_ui,
>> 	attr_pointer_atomic,
>> +	attr_block_high_watermark,
>> } attr_id_t;
>> 
>> typedef enum {
>> @@ -167,6 +168,7 @@ EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444);
>> EXT4_ATTR_FUNC(session_write_kbytes, 0444);
>> EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444);
>> EXT4_ATTR_FUNC(reserved_clusters, 0644);
>> +EXT4_ATTR_FUNC(block_high_watermark, 0600);
>> 
>> EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
>> 		 ext4_sb_info, s_inode_readahead_blks);
>> @@ -217,6 +219,7 @@ static struct attribute *ext4_attrs[] = {
>> 	ATTR_LIST(errors_count),
>> 	ATTR_LIST(first_error_time),
>> 	ATTR_LIST(last_error_time),
>> +	ATTR_LIST(block_high_watermark),
>> 	NULL,
>> };
>> 
>> @@ -304,6 +307,9 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
>> 		return print_tstamp(buf, sbi->s_es, s_first_error_time);
>> 	case attr_last_error_time:
>> 		return print_tstamp(buf, sbi->s_es, s_last_error_time);
>> +	case attr_block_high_watermark:
>> +		return snprintf(buf, PAGE_SIZE, "%llu\n",
>> +				(s64) sbi->s_block_high_watermark);
>> 	}
>> 
>> 	return 0;
>> @@ -318,6 +324,7 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
>> 	struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
>> 	void *ptr = calc_ptr(a, sbi);
>> 	unsigned long t;
>> +	unsigned long long t2;
>> 	int ret;
>> 
>> 	switch (a->attr_id) {
>> @@ -338,6 +345,18 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
>> 		return inode_readahead_blks_store(sbi, buf, len);
>> 	case attr_trigger_test_error:
>> 		return trigger_test_error(sbi, buf, len);
>> +	case attr_block_high_watermark:
>> +		if (!ptr)
>> +			return 0;
>> +		ret = kstrtoull(skip_spaces(buf), 0, &t2);
>> +		if (ret)
>> +			return ret;
>> +		if (t2 > ext4_blocks_count(sbi->s_es))
>> +			return -EINVAL;
>> +		if (t2 && t2 < le32_to_cpu(sbi->s_es->s_first_data_block))
>> +			return -EINVAL;
>> +		sbi->s_block_high_watermark = t2;
>> +		return len;
>> 	}
>> 	return 0;
>> }
> 


Cheers, Andreas
Andreas Dilger Nov. 22, 2018, 10:25 p.m. | #3
On Aug 21, 2018, at 8:21 PM, Jaco Kroon <jaco@uls.co.za> wrote:
> 
> NOT READY FOR MERGE!!!!!!
> 
> Limiting block allocations to a high watermark will eventually enable us
> to perform online shrinks of an ext4 filesystem.  As an immediate
> benefit it'll prevent allocation of blocks in the high range, which if
> performed as a precursor to an offline filesystem shrink will help to
> reduce the overall time a filesystem needs to be taken offline in order
> to shrink it.
> 
> (possible) shortcomings:
> 
> Currently this tunable does not get stored to the superblock, and thus
> needs to be set again after each mount.
> 
> The ext4_statfs function doesn't adjust the f_bavail value currently, as
> such df will report incorrect results.
> 
> The inode allocator hasn't been synced yet.

Hi Jaco,
sorry for the extreme delay in replying to this.  It was lost in my inbox
and I only just found it now.  Looking through the patch, it does seem OK
for the basic functionality intended, and would at least allow you to
reduce the number of blocks allocated at the end of the device, meaning
that the offline shrink would take less time (ideally none if all of the
files are removed from the end of the device).

With this first patch it should be possible to do an "online shrink" by
setting the high watermark, then walking the filesystem checking for any
files have blocks beyond the HWM via "filefrag -v" and running e4defrag
on those files.  This should be largely transparent to userspace.  The
current patch would not allow directly limiting inode allocation, but using
the "inode_goal" tunable could be used to influence the inode selection to
allow "mkdir + rsync + mv" to move directory trees to lower inodes.  Only
files currently open for write would not be safe to move to new inodes.


I think for fully using this functionality in the kernel/e2fsprogs a few
more additions are needed, as you mentioned above:
- store the high watermark in the superblock via tune2fs, so that it is
  not lost if the system is rebooted or filesystem remounted
- fix ext4_statfs() to adjust available blocks appropriately
- avoid allocating inodes in blocks above the high watermark

Typically, using tune2fs to adjust a mounted filesystem should change the
value used by the kernel, so also having a /sys tunable gets tricky.  One
option would be to leave "sbi->s_block_high_watermark = 0" and use the
superblock value if the sbi->s_block_high_watermark == 0, and only use
sbi->s_block_high_watermark if it is set directly?  Something like:

static inline
ext4_fsblk_t ext4_blocks_max_allocatable(struct ext4_sb_info *sbi)
{
	ext4_fsblk_t blocks = ext4_blocks_count(sbi->s_es);

	if (unlikely(sbi->s_block_high_watermark &&
		     sbi->s_block_high_watermark < blocks))
		return sbi->s_block_high_watermark;

	if (unlikely(sbi->s_es->s_blk_high_watermark &&
		     le64_to_cpu(sbi->s_es->s_blk_high_watermark) < blocks)
		return le64_to_cpu(sbi->s_es->s_blk_high_watermark);

	return blocks;
}

this adds a bit more runtime overhead vs. setting s_block_high_watermark
from the superblock at mount time, but is more flexible.

For ext4_statfs() do we subtract only the free blocks beyond HWM from the
available count, or all blocks?  Subtracting the difference between
ext4_blocks_count() and ext4_blocks_max_allocatable() is easy (zero if no
high watermark), but the available blocks should not be negative if there
are lots of blocks used beyond the HWM and few free below it.  Better would
be if the available blocks would report the free blocks below the HWM,
but this would involve subtracting free blocks above the HWM and adjusting
this as blocks above the limit are freed.

For the inode allocation limit, it is fairly straight forward to map the
block HWM to an inode HWM based on the group descriptor that the HWM is in.
For future use (dynamic inode tables) it may be desirable to also have a
separately tunable inode HWM, but it could also be done later as needed.

On the e2fsprogs side, there should be a "-E block_high_watermark=N" tunable
added to set the field in the superblock, and support to print it in dumpe2fs
and modify it in dumpe2fs via "ssv".

It may be useful to add a "-f" force flag to e4defrag so that it moves
inodes even if they are not less fragmented afterward, so blocks beyond the
HWM are always freed.  Alternately, block and inode move (for closed files)
might be implemented in userspace via resize2fs (essentially cp+rename) when
it is doing an online shrink of the filesystem?  That might be simpler from
a user point of view instead of needing to run e4defrag manuall that needs
to be scripted to find the files to be moved.

Optionally, should there be a "hard" and a "soft" block limit?  For example,
if the high watermark is set to a negative value -blocks it is a soft limit
(prefer lower allocation, but can exceed it if filesystem is full), or have
a separate "soft" flag stored somewhere else?  In the first case, we should
mask off the high bit when accessing this field, and use it only for deciding
if allocation can continue after a normal scan failed.

In the longer term, the resize ioctl could be enhanced to drop the last
group(s) if they are above the high watermark and have no used blocks/inodes.
The resize2fs tool could report if trying to shrink a filesystem with in-use
blocks that the HWM will be set and file migration is needed, then do the
online migration (reporting any files that are open via lsof) and returning
an error in the end that which processes are blocking the resize.

Some minor nits in the patch inline below:

> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 0f0edd1cd0cd..dc30ea107c55 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1423,6 +1423,7 @@ struct ext4_sb_info {
> 	unsigned int s_mb_order2_reqs;
> 	unsigned int s_mb_group_prealloc;
> 	unsigned int s_max_dir_size_kb;
> +	ext4_fsblk_t s_block_high_watermark; /* allocators must not allocate blocks above this */

(style) should stay under 80 columns.  Easiest to just shorten comment to
something like "/* max allocatable block number */" or similar.

> @@ -2711,6 +2712,15 @@ static inline ext4_fsblk_t ext4_blocks_count(struct +static inline ext4_fsblk_t ext4_blocks_max_allocatable(struct ext4_sb_info *sbi)
> +{
> +	ext4_fsblk_t blocks = ext4_blocks_count(sbi->s_es);

(style) blank line after variable declarations

> +	if (sbi->s_block_high_watermark && sbi->s_block_high_watermark < blocks)
> +		return sbi->s_block_high_watermark;
> +	else
> +		return blocks;

(style) no need for "else" after "return".

> diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
> index 9212a026a1f1..2a1a955c2c0b 100644
> --- a/fs/ext4/sysfs.c
> +++ b/fs/ext4/sysfs.c
> @@ -304,6 +307,9 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
> 		return print_tstamp(buf, sbi->s_es, s_first_error_time);
> 	case attr_last_error_time:
> 		return print_tstamp(buf, sbi->s_es, s_last_error_time);
> +	case attr_block_high_watermark:
> +		return snprintf(buf, PAGE_SIZE, "%llu\n",
> +				(s64) sbi->s_block_high_watermark);

(style) no space after typecast


Cheers, Andreas

Patch

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index e5d6ee61ff48..4f723c7a9c88 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -883,7 +883,7 @@  ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode)
 			block_group++;
 	}
 	bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
-	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
+	last_block = ext4_blocks_max_allocatable(EXT4_SB(inode->i_sb)) - 1;
 
 	/*
 	 * If we are doing delayed allocation, we don't need take
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0f0edd1cd0cd..dc30ea107c55 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1423,6 +1423,7 @@  struct ext4_sb_info {
 	unsigned int s_mb_order2_reqs;
 	unsigned int s_mb_group_prealloc;
 	unsigned int s_max_dir_size_kb;
+	ext4_fsblk_t s_block_high_watermark; /* allocators must not allocate blocks above this */
 	/* where last allocation was done - for stream allocation */
 	unsigned long s_mb_last_group;
 	unsigned long s_mb_last_start;
@@ -2711,6 +2712,15 @@  static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
 		le32_to_cpu(es->s_blocks_count_lo);
 }
 
+static inline ext4_fsblk_t ext4_blocks_max_allocatable(struct ext4_sb_info *sbi)
+{
+	ext4_fsblk_t blocks = ext4_blocks_count(sbi->s_es);
+	if (sbi->s_block_high_watermark && sbi->s_block_high_watermark < blocks)
+		return sbi->s_block_high_watermark;
+	else
+		return blocks;
+}
+
 static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es)
 {
 	return ((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) |
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e29fce2fbf25..a158c2c9de10 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4232,7 +4232,7 @@  ext4_mb_initialize_context(struct ext4_allocation_context *ac,
 	/* start searching from the goal */
 	goal = ar->goal;
 	if (goal < le32_to_cpu(es->s_first_data_block) ||
-			goal >= ext4_blocks_count(es))
+			goal >= ext4_blocks_max_allocatable(sbi))
 		goal = le32_to_cpu(es->s_first_data_block);
 	ext4_get_group_no_and_offset(sb, goal, &group, &block);
 
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 9212a026a1f1..2a1a955c2c0b 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -30,6 +30,7 @@  typedef enum {
 	attr_feature,
 	attr_pointer_ui,
 	attr_pointer_atomic,
+	attr_block_high_watermark,
 } attr_id_t;
 
 typedef enum {
@@ -167,6 +168,7 @@  EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444);
 EXT4_ATTR_FUNC(session_write_kbytes, 0444);
 EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444);
 EXT4_ATTR_FUNC(reserved_clusters, 0644);
+EXT4_ATTR_FUNC(block_high_watermark, 0600);
 
 EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
 		 ext4_sb_info, s_inode_readahead_blks);
@@ -217,6 +219,7 @@  static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(errors_count),
 	ATTR_LIST(first_error_time),
 	ATTR_LIST(last_error_time),
+	ATTR_LIST(block_high_watermark),
 	NULL,
 };
 
@@ -304,6 +307,9 @@  static ssize_t ext4_attr_show(struct kobject *kobj,
 		return print_tstamp(buf, sbi->s_es, s_first_error_time);
 	case attr_last_error_time:
 		return print_tstamp(buf, sbi->s_es, s_last_error_time);
+	case attr_block_high_watermark:
+		return snprintf(buf, PAGE_SIZE, "%llu\n",
+				(s64) sbi->s_block_high_watermark);
 	}
 
 	return 0;
@@ -318,6 +324,7 @@  static ssize_t ext4_attr_store(struct kobject *kobj,
 	struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
 	void *ptr = calc_ptr(a, sbi);
 	unsigned long t;
+	unsigned long long t2;
 	int ret;
 
 	switch (a->attr_id) {
@@ -338,6 +345,18 @@  static ssize_t ext4_attr_store(struct kobject *kobj,
 		return inode_readahead_blks_store(sbi, buf, len);
 	case attr_trigger_test_error:
 		return trigger_test_error(sbi, buf, len);
+	case attr_block_high_watermark:
+		if (!ptr)
+			return 0;
+		ret = kstrtoull(skip_spaces(buf), 0, &t2);
+		if (ret)
+			return ret;
+		if (t2 > ext4_blocks_count(sbi->s_es))
+			return -EINVAL;
+		if (t2 && t2 < le32_to_cpu(sbi->s_es->s_first_data_block))
+			return -EINVAL;
+		sbi->s_block_high_watermark = t2;
+		return len;
 	}
 	return 0;
 }