ext4: add an interface to load block bitmaps

Message ID 1526389583-32239-1-git-send-email-wshilong1991@gmail.com
State New
Headers show
Series
  • ext4: add an interface to load block bitmaps
Related show

Commit Message

Wang Shilong May 15, 2018, 1:06 p.m.
From: Wang Shilong <wshilong@ddn.com>

During our benchmarking, we found sometimes writing
performances are not stable enough and there are some
small read during write which could drop throughput(~30%).

  It turned out that block bitmaps loading could make
some latency here,also for a heavy fragmented filesystem,
we might need load many bitmaps to find some free blocks.

  To improve above situation, we had a patch to load block
bitmaps to memory and pin those bitmaps memory until umount
or we release the memory on purpose, this could stable write
performances and improve performances of a heavy fragmented
filesystem.

Tested-by: Shuichi Ihara <sihara@ddn.com>
Signed-off-by: Wang Shilong <wshilong@ddn.com>
---
 fs/ext4/balloc.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/ext4.h   |  12 +++++++
 fs/ext4/super.c  |   3 ++
 fs/ext4/sysfs.c  |  26 ++++++++++++++
 4 files changed, 146 insertions(+)

Comments

Andreas Dilger May 15, 2018, 8:22 p.m. | #1
On May 15, 2018, at 7:06 AM, Wang Shilong <wangshilong1991@gmail.com> wrote:
> 
> From: Wang Shilong <wshilong@ddn.com>
> 
> During our benchmarking, we found sometimes writing
> performances are not stable enough and there are some

s/performances/performance/

> small read during write which could drop throughput(~30%).
> 
>  It turned out that block bitmaps loading could make

s/make/add/

> some latency here,also for a heavy fragmented filesystem,
> we might need load many bitmaps to find some free blocks.
> 
>  To improve above situation, we had a patch to load block

s/had/have/

> bitmaps to memory and pin those bitmaps memory until umount

... in memory ...

> or we release the memory on purpose, this could stable write

... on purpose.  This can stabilize write

> performances and improve performances of a heavy fragmented

s/performances/performance/g   s/heavy/heavily/

> filesystem.
> 
> Tested-by: Shuichi Ihara <sihara@ddn.com>
> Signed-off-by: Wang Shilong <wshilong@ddn.com>
> ---
> fs/ext4/balloc.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
> fs/ext4/ext4.h   |  12 +++++++
> fs/ext4/super.c  |   3 ++
> fs/ext4/sysfs.c  |  26 ++++++++++++++
> 4 files changed, 146 insertions(+)
> 
> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
> index b00481c..ceb63e8 100644
> --- a/fs/ext4/balloc.c
> +++ b/fs/ext4/balloc.c
> @@ -505,6 +505,8 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
> 					EXT4_GROUP_INFO_BBITMAP_CORRUPT);
> 		return -EIO;
> 	}
> +	/* race is fine */
> +	EXT4_SB(sb)->bbitmaps_read_cnt++;
> 	clear_buffer_new(bh);
> 	/* Panic or remount fs read-only if block bitmap is invalid */
> 	return ext4_validate_block_bitmap(sb, desc, block_group, bh);
> @@ -660,6 +662,109 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle,
> +int ext4_load_block_bitmaps_bh(struct super_block *sb, unsigned int op)

This should take a named enum as parameter instead of "unsigned int".

It would probably be better to integrate this function into the existing
group descriptor handling in ext4_mb_init_backend->ext4_mb_add_groupinfo()
since it is already iterating all of the group descriptors, and it also
handles the case of groups being added by online filesystem resizing.

> +{
> +	struct buffer_head *bitmap_bh;
> +	struct ext4_group_desc *gdp;
> +	ext4_group_t i, j;
> +	ext4_group_t ngroups = ext4_get_groups_count(sb);
> +	ext4_group_t cnt = 0;
> +
> +	if (op < EXT4_LOAD_BBITMAPS || op > EXT4_PIN_BBITMAPS)
> +		return -EINVAL;
> +
> +	mutex_lock(&EXT4_SB(sb)->s_load_bbitmaps_lock);
> +	/* don't pin bitmaps several times */
> +	if (EXT4_SB(sb)->s_load_bbitmaps == EXT4_PIN_BBITMAPS) {
> +		mutex_unlock(&EXT4_SB(sb)->s_load_bbitmaps_lock);
> +		return 0;
> +	}
> +
> +	for (i = 0; i < ngroups; i++) {
> +		gdp = ext4_get_group_desc(sb, i, NULL);
> +		if (!gdp)
> +			continue;
> +		/* Load is simple, we could tolerate any
> +		 * errors and continue to handle, but for
> +		 * pin we return directly for simple handling
> +		 * in unpin codes, otherwiese we need remember

(typo) s/otherwiese/otherwise/ ... we need to remember

> +		 * which block bitmaps we pin exactly.
> +		 */

It should be straightforward to track the pinned bitmaps, by referencing
the buffer head in ext4_group_info with a new field like bb_block_bh.
That would allow the bitmaps to be pinned, but if there is some error
loading one of them it doesn't prevent the filesystem from being mounted.
I don't think that an error pinning the bitmaps should be a mount failure,
since this could happen if the system is short of memory for some reason.

It would be possible to check in ext4_read_block_bitmap() if the current
goal is to pin the bitmap that isn't loaded then a refcount is added to
bb_block_bh.

> +		bitmap_bh = ext4_read_block_bitmap(sb, i);
> +		if (IS_ERR(bitmap_bh)) {
> +			if (op == EXT4_LOAD_BBITMAPS)
> +				continue;
> +			else

(style) no need for "else" after "continue"

> +				goto failed;
> +		}
> +		if (op == EXT4_LOAD_BBITMAPS)
> +			brelse(bitmap_bh);
> +		cnt++;
> +	}
> +	/* Reset block bitmap to zero now */
> +	EXT4_SB(sb)->bbitmaps_read_cnt = 0;

> +	ext4_msg(sb, KERN_INFO, "%s %u block bitmaps finished",
> +		 op == EXT4_PIN_BBITMAPS ? "pin" : "load", cnt);
> +	EXT4_SB(sb)->s_load_bbitmaps = EXT4_PIN_BBITMAPS;
> +	mutex_unlock(&EXT4_SB(sb)->s_load_bbitmaps_lock);
> +
> +	return 0;
> +failed:
> +	for (j = 0; j < i; j++) {
> +		gdp = ext4_get_group_desc(sb, i, NULL);
> +		if (!gdp)
> +			continue;
> +		bitmap_bh = ext4_read_block_bitmap(sb, i);
> +		if (!IS_ERR(bitmap_bh)) {
> +			brelse(bitmap_bh);
> +			brelse(bitmap_bh);
> +		}
> +	}
> +	mutex_unlock(&EXT4_SB(sb)->s_load_bbitmaps_lock);
> +	return PTR_ERR(bitmap_bh);
> +}
> +
> +void ext4_unpin_block_bitmaps_bh(struct super_block *sb)
> +{
> +	struct buffer_head *bitmap_bh;
> +	struct ext4_group_desc *gdp;
> +	ext4_group_t i;
> +	ext4_group_t ngroups = ext4_get_groups_count(sb);
> +	ext4_group_t cnt = 0;
> +
> +	mutex_lock(&EXT4_SB(sb)->s_load_bbitmaps_lock);
> +	if (EXT4_SB(sb)->s_load_bbitmaps == EXT4_UNPIN_BBITMAPS) {
> +		mutex_unlock(&EXT4_SB(sb)->s_load_bbitmaps_lock);
> +		return;
> +	}
> +
> +	ext4_msg(sb, KERN_INFO,
> +		 "Read block block bitmaps: %lu afer %s",

(typo) s/afer/after/

> +		 EXT4_SB(sb)->bbitmaps_read_cnt,
> +		 EXT4_SB(sb)->s_load_bbitmaps == EXT4_PIN_BBITMAPS ?
> +		 "pin" : "load");
> +
> +	if (EXT4_SB(sb)->s_load_bbitmaps != EXT4_PIN_BBITMAPS) {
> +		mutex_unlock(&EXT4_SB(sb)->s_load_bbitmaps_lock);
> +		return;
> +	}
> +
> +	for (i = 0; i < ngroups; i++) {
> +		gdp = ext4_get_group_desc(sb, i, NULL);

This should be changed to:

		struct ext4_group_info *grinfo = ext4_get_group_info(sb, i);

and then only brelse the bufferhead if it was pinned properly:

		if (!IS_ERR(grinfo->bb_block_bh))
			brelse(grinfo->bb_block_bh);
		grinfo->bb_block_bh = NULL;

In theory we could also optimize ext4_read_block_bitmap() to add a
reference to bb_block_bh directly instead of the extra work done there.

> +		if (!gdp)
> +			continue;
> +		bitmap_bh = ext4_read_block_bitmap(sb, i);
> +		if (IS_ERR(bitmap_bh))
> +			continue;
> +		brelse(bitmap_bh);
> +		brelse(bitmap_bh);
> +		cnt++;
> +	}
> +	ext4_msg(sb, KERN_INFO, "Unpin %u lock bitmaps finished", cnt);
> +	EXT4_SB(sb)->s_load_bbitmaps = EXT4_UNPIN_BBITMAPS;
> +	mutex_unlock(&EXT4_SB(sb)->s_load_bbitmaps_lock);
> +}
> +
> /**
>  * ext4_count_free_clusters() -- count filesystem free clusters
>  * @sb:		superblock
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index fa52b7d..4f9ee73 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1317,6 +1317,12 @@ struct ext4_super_block {
> /* Number of quota types we support */
> #define EXT4_MAXQUOTAS 3
> 
> +enum {

This should be a named enum, like ext4_bitmap_state.

> +	EXT4_UNPIN_BBITMAPS = 0,
> +	EXT4_LOAD_BBITMAPS,
> +	EXT4_PIN_BBITMAPS,

Since these values are exported to userspace, they should have specific
values assigned.  Also, we should consider the case of being able to
load/pin inode bitmaps.  Something like:

enum ext4_bitmap_state {
	EXT4_UNPIN_BITMAPS	= 0x00,
	EXT4_LOAD_BBITMAPS	= 0x01,
	EXT4_PIN_BBITMAPS	= 0x02,
	EXT4_LOAD_IBITMAPS	= 0x04,
	EXT4_PIN_IBITMAPS	= 0x08,
};

> diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
> index 9ebd26c..89396b3f 100644
> --- a/fs/ext4/sysfs.c
> +++ b/fs/ext4/sysfs.c
> 
> +static ssize_t load_bbitmaps_store(struct ext4_sb_info *sbi,
> +				   const char *buf, size_t count)

I'd suggest to take "b" (block) out of the name here so that this
can also be used for inode bitmaps in the future.

> +{
> +	unsigned long long val;
> +	int ret;
> +
> +	ret = kstrtoull(skip_spaces(buf), 0, &val);
> +	if (ret || val > EXT4_PIN_BBITMAPS)
> +		return -EINVAL;
> +
> +	if (val == EXT4_UNPIN_BBITMAPS)
> +		ext4_unpin_block_bitmaps_bh(sbi->s_sb);
> +	else if (val > EXT4_UNPIN_BBITMAPS)
> +		ret = ext4_load_block_bitmaps_bh(sbi->s_sb, val);

This is not a great userspace interface, since the EXT4_*_BBITMAPS
values don't have specific values.  It would be nicer to accept
string names like "block" or "block-pin" or "inode" or "inode-pin",
and "unpin", but at least if there are fixed numbers (something like
"echo 6 > /sys/fs/ext4/sda/load_bitmaps" to load both bitmaps and
also pin the block bitmaps) the userspace interface won't change.

> +	return ret ? ret : count;
> +}
> 
> @@ -270,6 +291,9 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
> 		return snprintf(buf, PAGE_SIZE, "%llu\n",
> 				(unsigned long long)
> 				atomic64_read(&sbi->s_resv_clusters));
> +	case attr_load_bbitmaps:
> +		return snprintf(buf, PAGE_SIZE, "%u\n",
> +				sbi->s_load_bbitmaps);

This should use "%#x" so that we know which flags are set.

Cheers, Andreas
Theodore Y. Ts'o May 16, 2018, 2:58 a.m. | #2
On Tue, May 15, 2018 at 10:06:23PM +0900, Wang Shilong wrote:
> From: Wang Shilong <wshilong@ddn.com>
> 
> During our benchmarking, we found sometimes writing
> performances are not stable enough and there are some
> small read during write which could drop throughput(~30%).

Out of curiosity, what sort of benchmarks are you doing? 

>   It turned out that block bitmaps loading could make
> some latency here,also for a heavy fragmented filesystem,
> we might need load many bitmaps to find some free blocks.
> 
>   To improve above situation, we had a patch to load block
> bitmaps to memory and pin those bitmaps memory until umount
> or we release the memory on purpose, this could stable write
> performances and improve performances of a heavy fragmented
> filesystem.

This is true, but I wonder how realistic this is on real production
systems.  For a 1 TiB file system, pinning all of the block bitmaps
will require 32 megabytes of memory.  Is that really realistic for
your use case?

So is this something just for benchmarking (in which case what are you
trying to benchmark)?  Or is this something that you want to use in
production?  And if so, perhaps something to consider is analyzing how
fragmented and how full you want to run your file system.

Something to perhaps consider doing is storing the bitmap in memory in
a compressed form.  For example, you could use a run length encoding
scheme where 2 bytes is used to encoding the starting block of a free
extent, and 2 bytes to encode the length of the free extent.  For a
large number of mostly full (or mostly empty, for that matter) block
allocation bitmaps, this will be a much more efficient way to cache
the information in memory if you really want to keep all of the
allocation information in memory.

Something else to investigate is *why* is the file system getting so
fragmented in the first place, and are there things we can do to
prevent the file system from getting that fragmented in the first
place....

						- Ted
Andreas Dilger May 16, 2018, 6:12 p.m. | #3
On May 15, 2018, at 8:58 PM, Theodore Y. Ts'o <tytso@mit.edu> wrote:
> 
> On Tue, May 15, 2018 at 10:06:23PM +0900, Wang Shilong wrote:
>> From: Wang Shilong <wshilong@ddn.com>
>> 
>> During our benchmarking, we found sometimes writing
>> performances are not stable enough and there are some
>> small read during write which could drop throughput(~30%).
> 
> Out of curiosity, what sort of benchmarks are you doing?

This is related to Lustre, though I can't comment on the specific
benchmarks.  We've had a variety of reports about this issue. The
current workaround (hack) is "dumpe2fs /dev/XXX > /dev/null" at
mount time and periodically at runtime via cron to keep the bitmaps
loaded, but we wanted to get a better solution in place that works
for everyone.  Being able to load all of the bitmaps at mount time
also helps get the server performance "up to speed" quickly, rather
than on-demand loading of a few hundred MB of bitmaps over time.

>>  It turned out that block bitmaps loading could make
>> some latency here,also for a heavy fragmented filesystem,
>> we might need load many bitmaps to find some free blocks.
>> 
>>  To improve above situation, we had a patch to load block
>> bitmaps to memory and pin those bitmaps memory until umount
>> or we release the memory on purpose, this could stable write
>> performances and improve performances of a heavy fragmented
>> filesystem.
> 
> This is true, but I wonder how realistic this is on real production
> systems.  For a 1 TiB file system, pinning all of the block bitmaps
> will require 32 megabytes of memory.  Is that really realistic for
> your use case?

In the case of Lustre servers, they typically have 128GB of RAM or
more, and are serving a few hundred TB of storage each these days,
but do not have any other local users, so the ~6GB RAM usage isn't a
huge problem if it improves the performance/consistency. The real
issue is that the bitmaps do not get referenced often enough compared
to the 10GB/s of data flowing through the server, so they are pushed
out of memory too quickly.

> So is this something just for benchmarking (in which case what are you
> trying to benchmark)?  Or is this something that you want to use in
> production?  And if so, perhaps something to consider is analyzing how
> fragmented and how full you want to run your file system.

Fullness and fragmentation is up to the end user, and not really under
our control.  I think filesystems are typically always 75-80% full, and
when they pass 90% there is a "purge" of old files to get it back under
the 75% range.

> Something else to investigate is *why* is the file system getting so
> fragmented in the first place, and are there things we can do to
> prevent the file system from getting that fragmented in the first
> place....

I think it is just filesystem aging, and some users aren't good at keeping
the space usage below 90-95%, but then are unhappy when the performance
goes down afterward.

> Something to perhaps consider doing is storing the bitmap in memory in
> a compressed form.  For example, you could use a run length encoding
> scheme where 2 bytes is used to encoding the starting block of a free
> extent, and 2 bytes to encode the length of the free extent.  For a
> large number of mostly full (or mostly empty, for that matter) block
> allocation bitmaps, this will be a much more efficient way to cache
> the information in memory if you really want to keep all of the
> allocation information in memory.

I was thinking of something like this for the inode bitmaps, especially
since they are often going to be mostly unused (default is 1MB/inode for
Lustre OSTs, so only 16/4096 bytes used in each inode bitmap).  Another
option is to store only in-use bytes instead of the full inode bitmap.

Cheers, Andreas
Jan Kara May 23, 2018, 3:30 p.m. | #4
On Wed 16-05-18 12:12:00, Andreas Dilger wrote:
> On May 15, 2018, at 8:58 PM, Theodore Y. Ts'o <tytso@mit.edu> wrote:
> > 
> > On Tue, May 15, 2018 at 10:06:23PM +0900, Wang Shilong wrote:
> >> From: Wang Shilong <wshilong@ddn.com>
> >> 
> >> During our benchmarking, we found sometimes writing
> >> performances are not stable enough and there are some
> >> small read during write which could drop throughput(~30%).
> > 
> > Out of curiosity, what sort of benchmarks are you doing?
> 
> This is related to Lustre, though I can't comment on the specific
> benchmarks.  We've had a variety of reports about this issue. The
> current workaround (hack) is "dumpe2fs /dev/XXX > /dev/null" at
> mount time and periodically at runtime via cron to keep the bitmaps
> loaded, but we wanted to get a better solution in place that works
> for everyone.  Being able to load all of the bitmaps at mount time
> also helps get the server performance "up to speed" quickly, rather
> than on-demand loading of a few hundred MB of bitmaps over time.

So what I don't like on the explicit pinning is that only a few admins will
be able to get it right.

> >>  It turned out that block bitmaps loading could make
> >> some latency here,also for a heavy fragmented filesystem,
> >> we might need load many bitmaps to find some free blocks.
> >> 
> >>  To improve above situation, we had a patch to load block
> >> bitmaps to memory and pin those bitmaps memory until umount
> >> or we release the memory on purpose, this could stable write
> >> performances and improve performances of a heavy fragmented
> >> filesystem.
> > 
> > This is true, but I wonder how realistic this is on real production
> > systems.  For a 1 TiB file system, pinning all of the block bitmaps
> > will require 32 megabytes of memory.  Is that really realistic for
> > your use case?
> 
> In the case of Lustre servers, they typically have 128GB of RAM or
> more, and are serving a few hundred TB of storage each these days,
> but do not have any other local users, so the ~6GB RAM usage isn't a
> huge problem if it improves the performance/consistency. The real
> issue is that the bitmaps do not get referenced often enough compared
> to the 10GB/s of data flowing through the server, so they are pushed
> out of memory too quickly.

OK, and that 10GB/s is mostly use once data?

So I could imagine we cache free block information in a more efficient format
(something like you or Ted describe), provide a proper shrinker for it
(possibly biasing it to make reclaim less likely), and enable it always...
That way users don't have to configure it and we don't have to be afraid of
eating too much memory in expense of something else.

								Honza
Andreas Dilger May 23, 2018, 5:12 p.m. | #5
On May 23, 2018, at 9:30 AM, Jan Kara <jack@suse.cz> wrote:
> 
> On Wed 16-05-18 12:12:00, Andreas Dilger wrote:
>> On May 15, 2018, at 8:58 PM, Theodore Y. Ts'o <tytso@mit.edu> wrote:
>>> 
>>> On Tue, May 15, 2018 at 10:06:23PM +0900, Wang Shilong wrote:
>>>> From: Wang Shilong <wshilong@ddn.com>
>>>> 
>>>> During our benchmarking, we found sometimes writing
>>>> performances are not stable enough and there are some
>>>> small read during write which could drop throughput(~30%).
>>> 
>>> Out of curiosity, what sort of benchmarks are you doing?
>> 
>> This is related to Lustre, though I can't comment on the specific
>> benchmarks.  We've had a variety of reports about this issue. The
>> current workaround (hack) is "dumpe2fs /dev/XXX > /dev/null" at
>> mount time and periodically at runtime via cron to keep the bitmaps
>> loaded, but we wanted to get a better solution in place that works
>> for everyone.  Being able to load all of the bitmaps at mount time
>> also helps get the server performance "up to speed" quickly, rather
>> than on-demand loading of a few hundred MB of bitmaps over time.
> 
> So what I don't like on the explicit pinning is that only a few admins will
> be able to get it right.

Sure, I agree it would be better to handle this more automatically.

>>>> It turned out that block bitmaps loading could make
>>>> some latency here,also for a heavy fragmented filesystem,
>>>> we might need load many bitmaps to find some free blocks.
>>>> 
>>>> To improve above situation, we had a patch to load block
>>>> bitmaps to memory and pin those bitmaps memory until umount
>>>> or we release the memory on purpose, this could stable write
>>>> performances and improve performances of a heavy fragmented
>>>> filesystem.
>>> 
>>> This is true, but I wonder how realistic this is on real production
>>> systems.  For a 1 TiB file system, pinning all of the block bitmaps
>>> will require 32 megabytes of memory.  Is that really realistic for
>>> your use case?
>> 
>> In the case of Lustre servers, they typically have 128GB of RAM or
>> more, and are serving a few hundred TB of storage each these days,
>> but do not have any other local users, so the ~6GB RAM usage isn't a
>> huge problem if it improves the performance/consistency. The real
>> issue is that the bitmaps do not get referenced often enough compared
>> to the 10GB/s of data flowing through the server, so they are pushed
>> out of memory too quickly.
> 
> OK, and that 10GB/s is mostly use once data?

Pretty much, yes.  There isn't much chance to re-use the data when it
can only fit into RAM for a few seconds.

> So I could imagine we cache free block information in a more efficient format
> (something like you or Ted describe), provide a proper shrinker for it
> (possibly biasing it to make reclaim less likely), and enable it always...
> That way users don't have to configure it and we don't have to be afraid of
> eating too much memory in expense of something else.

One of the problems with this approach is that having compressed bitmaps
wouldn't necessarily help the performance issue.  This would allow the
initial block allocation to proceed without reading the bitmap from disk,
but then the bitmap still needs to be updated and written to disk in that
transaction.

I guess one possibility is to reconstruct a full bitmap from the compressed
in-memory bitmap for the write, but this also carries some risk if there
is an error - the reconstructed bitmap is much more likely to have a large
corruption because it is generated from a compressed version, compared to
a (likely) small corruption in the full bitmap.

Of course, the other question is the complexity of implementing this.
Pinning the bitmaps is a trivial change that can be applied to a wide range
of kernel versions, while adding compressed bitmaps will add a lot more
code and complexity.  I'm not against that, but it would take longer to do.

Cheers, Andreas
Jan Kara May 24, 2018, 8:42 a.m. | #6
On Wed 23-05-18 11:12:29, Andreas Dilger wrote:
> On May 23, 2018, at 9:30 AM, Jan Kara <jack@suse.cz> wrote:
> >>>> It turned out that block bitmaps loading could make
> >>>> some latency here,also for a heavy fragmented filesystem,
> >>>> we might need load many bitmaps to find some free blocks.
> >>>> 
> >>>> To improve above situation, we had a patch to load block
> >>>> bitmaps to memory and pin those bitmaps memory until umount
> >>>> or we release the memory on purpose, this could stable write
> >>>> performances and improve performances of a heavy fragmented
> >>>> filesystem.
> >>> 
> >>> This is true, but I wonder how realistic this is on real production
> >>> systems.  For a 1 TiB file system, pinning all of the block bitmaps
> >>> will require 32 megabytes of memory.  Is that really realistic for
> >>> your use case?
> >> 
> >> In the case of Lustre servers, they typically have 128GB of RAM or
> >> more, and are serving a few hundred TB of storage each these days,
> >> but do not have any other local users, so the ~6GB RAM usage isn't a
> >> huge problem if it improves the performance/consistency. The real
> >> issue is that the bitmaps do not get referenced often enough compared
> >> to the 10GB/s of data flowing through the server, so they are pushed
> >> out of memory too quickly.
> > 
> > OK, and that 10GB/s is mostly use once data?
> 
> Pretty much, yes.  There isn't much chance to re-use the data when it
> can only fit into RAM for a few seconds.

Fair enough.

> > So I could imagine we cache free block information in a more efficient format
> > (something like you or Ted describe), provide a proper shrinker for it
> > (possibly biasing it to make reclaim less likely), and enable it always...
> > That way users don't have to configure it and we don't have to be afraid of
> > eating too much memory in expense of something else.
> 
> One of the problems with this approach is that having compressed bitmaps
> wouldn't necessarily help the performance issue.  This would allow the
> initial block allocation to proceed without reading the bitmap from disk,
> but then the bitmap still needs to be updated and written to disk in that
> transaction.
>
> I guess one possibility is to reconstruct a full bitmap from the compressed
> in-memory bitmap for the write, but this also carries some risk if there
> is an error - the reconstructed bitmap is much more likely to have a large
> corruption because it is generated from a compressed version, compared to
> a (likely) small corruption in the full bitmap.

Yes, generating the bitmap buffer (so that it can be modified & journalled)
from the compressed data is what I had in mind. But note that even without
that at least the initial search for free blocks would not have to load all
the bitmaps for groups we find are not suitable for the allocation in the
end. I'm not sure how big help would it be for your workload but for a
fragmented filesystem I expect it would be actually a big win.

I agree that generating the bitmap buffer from the compressed in-memory
data sounds a bit scary but I'd be more afraid of programming bugs than
plain memory corruption. So at least initially we could have a paranoia
mode where we load the bitmap we are going to modify from disk and make
sure it's consistent with the compressed version.

> Of course, the other question is the complexity of implementing this.
> Pinning the bitmaps is a trivial change that can be applied to a wide range
> of kernel versions, while adding compressed bitmaps will add a lot more
> code and complexity.  I'm not against that, but it would take longer to do.

Yes, it's more complexity but pinning is OTOH user visible API so that has
some maintenance overhead as well as we have to maintain it indefinitely.
And your hack with dumpe2fs works good enough (as much as it's ugly) that I
don't think we need to hurry some half baked solutions...

								Honza
Theodore Y. Ts'o May 24, 2018, 5 p.m. | #7
On Thu, May 24, 2018 at 10:42:43AM +0200, Jan Kara wrote:
> 
> Yes, it's more complexity but pinning is OTOH user visible API so that has
> some maintenance overhead as well as we have to maintain it indefinitely.
> And your hack with dumpe2fs works good enough (as much as hit's ugly) that I
> don't think we need to hurry some half baked solutions...

If the user-visible API is a mount option, it's really not that hard
deprecate it later in favor of something which is done automatically
via some kind of compressed in-memory pinning mechanism --- we would
just turn the mount option into a no-op plus a print-once warning
message saying it's no longer needed.

Jan's design makes much more sense as the long-term direction --- and
if the good folks at DDN are willing to code it, I would be greatly
pleased.  That being said, if they only want the short-term hack, my
primary concern is code maintenance and complexity issue.  If there is
a simple for loop that calls the existing functions to load the block
bitmap, and then pin them by bumping the ref count, I'm not going to
object that much.  What I didn't like about the inital patch was the
large number of new code that duplicates existing functionality, since
that's a code maintenance burden.

I will say that if we use an rbtree of free block extents, we could
probably use this instead of the buddy bitmaps, and the resulting code
could probably do a better job finding free blocks and reducing
fragmentation.  So that's another reason to go down the patch
suggested by Jan.

Regards,

					- Ted

Patch

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index b00481c..ceb63e8 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -505,6 +505,8 @@  int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
 					EXT4_GROUP_INFO_BBITMAP_CORRUPT);
 		return -EIO;
 	}
+	/* race is fine */
+	EXT4_SB(sb)->bbitmaps_read_cnt++;
 	clear_buffer_new(bh);
 	/* Panic or remount fs read-only if block bitmap is invalid */
 	return ext4_validate_block_bitmap(sb, desc, block_group, bh);
@@ -660,6 +662,109 @@  ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 	return ret;
 }
 
+int ext4_load_block_bitmaps_bh(struct super_block *sb, unsigned int op)
+{
+	struct buffer_head *bitmap_bh;
+	struct ext4_group_desc *gdp;
+	ext4_group_t i, j;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	ext4_group_t cnt = 0;
+
+	if (op < EXT4_LOAD_BBITMAPS || op > EXT4_PIN_BBITMAPS)
+		return -EINVAL;
+
+	mutex_lock(&EXT4_SB(sb)->s_load_bbitmaps_lock);
+	/* don't pin bitmaps several times */
+	if (EXT4_SB(sb)->s_load_bbitmaps == EXT4_PIN_BBITMAPS) {
+		mutex_unlock(&EXT4_SB(sb)->s_load_bbitmaps_lock);
+		return 0;
+	}
+
+	for (i = 0; i < ngroups; i++) {
+		gdp = ext4_get_group_desc(sb, i, NULL);
+		if (!gdp)
+			continue;
+		/* Load is simple, we could tolerate any
+		 * errors and continue to handle, but for
+		 * pin we return directly for simple handling
+		 * in unpin codes, otherwiese we need remember
+		 * which block bitmaps we pin exactly.
+		 */
+		bitmap_bh = ext4_read_block_bitmap(sb, i);
+		if (IS_ERR(bitmap_bh)) {
+			if (op == EXT4_LOAD_BBITMAPS)
+				continue;
+			else
+				goto failed;
+		}
+		if (op == EXT4_LOAD_BBITMAPS)
+			brelse(bitmap_bh);
+		cnt++;
+	}
+	/* Reset block bitmap to zero now */
+	EXT4_SB(sb)->bbitmaps_read_cnt = 0;
+	ext4_msg(sb, KERN_INFO, "%s %u block bitmaps finished",
+		 op == EXT4_PIN_BBITMAPS ? "pin" : "load", cnt);
+	EXT4_SB(sb)->s_load_bbitmaps = EXT4_PIN_BBITMAPS;
+	mutex_unlock(&EXT4_SB(sb)->s_load_bbitmaps_lock);
+
+	return 0;
+failed:
+	for (j = 0; j < i; j++) {
+		gdp = ext4_get_group_desc(sb, i, NULL);
+		if (!gdp)
+			continue;
+		bitmap_bh = ext4_read_block_bitmap(sb, i);
+		if (!IS_ERR(bitmap_bh)) {
+			brelse(bitmap_bh);
+			brelse(bitmap_bh);
+		}
+	}
+	mutex_unlock(&EXT4_SB(sb)->s_load_bbitmaps_lock);
+	return PTR_ERR(bitmap_bh);
+}
+
+void ext4_unpin_block_bitmaps_bh(struct super_block *sb)
+{
+	struct buffer_head *bitmap_bh;
+	struct ext4_group_desc *gdp;
+	ext4_group_t i;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	ext4_group_t cnt = 0;
+
+	mutex_lock(&EXT4_SB(sb)->s_load_bbitmaps_lock);
+	if (EXT4_SB(sb)->s_load_bbitmaps == EXT4_UNPIN_BBITMAPS) {
+		mutex_unlock(&EXT4_SB(sb)->s_load_bbitmaps_lock);
+		return;
+	}
+
+	ext4_msg(sb, KERN_INFO,
+		 "Read block block bitmaps: %lu afer %s",
+		 EXT4_SB(sb)->bbitmaps_read_cnt,
+		 EXT4_SB(sb)->s_load_bbitmaps == EXT4_PIN_BBITMAPS ?
+		 "pin" : "load");
+
+	if (EXT4_SB(sb)->s_load_bbitmaps != EXT4_PIN_BBITMAPS) {
+		mutex_unlock(&EXT4_SB(sb)->s_load_bbitmaps_lock);
+		return;
+	}
+
+	for (i = 0; i < ngroups; i++) {
+		gdp = ext4_get_group_desc(sb, i, NULL);
+		if (!gdp)
+			continue;
+		bitmap_bh = ext4_read_block_bitmap(sb, i);
+		if (IS_ERR(bitmap_bh))
+			continue;
+		brelse(bitmap_bh);
+		brelse(bitmap_bh);
+		cnt++;
+	}
+	ext4_msg(sb, KERN_INFO, "Unpin %u lock bitmaps finished", cnt);
+	EXT4_SB(sb)->s_load_bbitmaps = EXT4_UNPIN_BBITMAPS;
+	mutex_unlock(&EXT4_SB(sb)->s_load_bbitmaps_lock);
+}
+
 /**
  * ext4_count_free_clusters() -- count filesystem free clusters
  * @sb:		superblock
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index fa52b7d..4f9ee73 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1317,6 +1317,12 @@  struct ext4_super_block {
 /* Number of quota types we support */
 #define EXT4_MAXQUOTAS 3
 
+enum {
+	EXT4_UNPIN_BBITMAPS = 0,
+	EXT4_LOAD_BBITMAPS,
+	EXT4_PIN_BBITMAPS,
+};
+
 /*
  * fourth extended-fs super-block data in memory
  */
@@ -1487,6 +1493,10 @@  struct ext4_sb_info {
 	/* Barrier between changing inodes' journal flags and writepages ops. */
 	struct percpu_rw_semaphore s_journal_flag_rwsem;
 	struct dax_device *s_daxdev;
+
+	struct mutex s_load_bbitmaps_lock;
+	unsigned long bbitmaps_read_cnt;
+	unsigned int s_load_bbitmaps;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -2224,6 +2234,8 @@  int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
 				  struct buffer_head *bh);
 
 /* balloc.c */
+int ext4_load_block_bitmaps_bh(struct super_block *sb, unsigned int op);
+void ext4_unpin_block_bitmaps_bh(struct super_block *sb);
 extern void ext4_get_group_no_and_offset(struct super_block *sb,
 					 ext4_fsblk_t blocknr,
 					 ext4_group_t *blockgrpp,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 1388e56..b3e896f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -902,6 +902,7 @@  static void ext4_put_super(struct super_block *sb)
 	int aborted = 0;
 	int i, err;
 
+	ext4_unpin_block_bitmaps_bh(sb);
 	ext4_unregister_li_request(sb);
 	ext4_quota_off_umount(sb);
 
@@ -4393,6 +4394,8 @@  static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10);
 	ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
 
+	mutex_init(&EXT4_SB(sb)->s_load_bbitmaps_lock);
+
 	kfree(orig_data);
 	return 0;
 
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 9ebd26c..89396b3f 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -23,6 +23,7 @@ 
 	attr_session_write_kbytes,
 	attr_lifetime_write_kbytes,
 	attr_reserved_clusters,
+	attr_load_bbitmaps,
 	attr_inode_readahead,
 	attr_trigger_test_error,
 	attr_feature,
@@ -105,6 +106,24 @@  static ssize_t reserved_clusters_store(struct ext4_sb_info *sbi,
 	return count;
 }
 
+static ssize_t load_bbitmaps_store(struct ext4_sb_info *sbi,
+				   const char *buf, size_t count)
+{
+	unsigned long long val;
+	int ret;
+
+	ret = kstrtoull(skip_spaces(buf), 0, &val);
+	if (ret || val > EXT4_PIN_BBITMAPS)
+		return -EINVAL;
+
+	if (val == EXT4_UNPIN_BBITMAPS)
+		ext4_unpin_block_bitmaps_bh(sbi->s_sb);
+	else if (val > EXT4_UNPIN_BBITMAPS)
+		ret = ext4_load_block_bitmaps_bh(sbi->s_sb, val);
+
+	return ret ? ret : count;
+}
+
 static ssize_t trigger_test_error(struct ext4_sb_info *sbi,
 				  const char *buf, size_t count)
 {
@@ -163,6 +182,7 @@  static ssize_t trigger_test_error(struct ext4_sb_info *sbi,
 EXT4_ATTR_FUNC(session_write_kbytes, 0444);
 EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444);
 EXT4_ATTR_FUNC(reserved_clusters, 0644);
+EXT4_ATTR_FUNC(load_bbitmaps, 0644);
 
 EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
 		 ext4_sb_info, s_inode_readahead_blks);
@@ -193,6 +213,7 @@  static ssize_t trigger_test_error(struct ext4_sb_info *sbi,
 	ATTR_LIST(session_write_kbytes),
 	ATTR_LIST(lifetime_write_kbytes),
 	ATTR_LIST(reserved_clusters),
+	ATTR_LIST(load_bbitmaps),
 	ATTR_LIST(inode_readahead_blks),
 	ATTR_LIST(inode_goal),
 	ATTR_LIST(mb_stats),
@@ -270,6 +291,9 @@  static ssize_t ext4_attr_show(struct kobject *kobj,
 		return snprintf(buf, PAGE_SIZE, "%llu\n",
 				(unsigned long long)
 				atomic64_read(&sbi->s_resv_clusters));
+	case attr_load_bbitmaps:
+		return snprintf(buf, PAGE_SIZE, "%u\n",
+				sbi->s_load_bbitmaps);
 	case attr_inode_readahead:
 	case attr_pointer_ui:
 		if (!ptr)
@@ -302,6 +326,8 @@  static ssize_t ext4_attr_store(struct kobject *kobj,
 	switch (a->attr_id) {
 	case attr_reserved_clusters:
 		return reserved_clusters_store(sbi, buf, len);
+	case attr_load_bbitmaps:
+		return load_bbitmaps_store(sbi, buf, len);
 	case attr_pointer_ui:
 		if (!ptr)
 			return 0;