diff mbox series

[RFC] improve mballoc for large filesystems: prefetch bitmaps

Message ID E4874E78-3D87-46A3-A3B3-3128ED8A2ED2@whamcloud.com
State New
Headers show
Series [RFC] improve mballoc for large filesystems: prefetch bitmaps | expand

Commit Message

Alex Zhuravlev Dec. 2, 2019, 9:08 a.m. UTC
Hi,

Here is another patch for prefetching, reworked a bit:
- flex_bg is taken into account as few bitmaps are supposed to be fetched with a single IO
- limit number of prefetches at cr=0 so that mballoc doesn’t try to load all bitmaps

Thanks, Alex

Comments

Andreas Dilger Dec. 2, 2019, 9:46 p.m. UTC | #1
On Dec 2, 2019, at 2:08 AM, Alex Zhuravlev <azhuravlev@whamcloud.com> wrote:
> 
> Hi,
> 
> Here is another patch for prefetching, reworked a bit:
> - flex_bg is taken into account as few bitmaps are supposed to be fetched with a single IO
> - limit number of prefetches at cr=0 so that mballoc doesn’t try to load all bitmaps
> 
> Thanks, Alex
> 
> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
> index 0b202e00d93f..0bc694c5dcfe 100644
> --- a/fs/ext4/balloc.c
> +++ b/fs/ext4/balloc.c
> @@ -404,7 +404,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
>  * Return buffer_head on success or NULL in case of failure.
>  */
> struct buffer_head *
> -ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
> +ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
> +				 int ignore_locked)

(style) should be "bool" I think

> @@ -524,7 +532,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
> 	struct buffer_head *bh;
> 	int err;
> 
> -	bh = ext4_read_block_bitmap_nowait(sb, block_group);
> +	bh = ext4_read_block_bitmap_nowait(sb, block_group, 1);

,,, which means this should be "true"

> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index f8578caba40d..4a7f4ccd8641 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1485,6 +1485,8 @@ struct ext4_sb_info {
> 	/* where last allocation was done - for stream allocation */
> 	unsigned long s_mb_last_group;
> 	unsigned long s_mb_last_start;
> +	unsigned int s_mb_prefetch;
> +	unsigned int s_mb_prefetch_limit;

(style) please add brief comments for these fields

> @@ -2339,7 +2341,8 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
> extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
> 
> extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
> -						ext4_group_t block_group);
> +						ext4_group_t block_group,
> +						int ignore_locked);

(style) bool

> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 7c6c34fd8e1c..e4d93c9a6b77 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -861,7 +861,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
> 			bh[i] = NULL;
> 			continue;
> 		}
> -		bh[i] = ext4_read_block_bitmap_nowait(sb, group);
> +		bh[i] = ext4_read_block_bitmap_nowait(sb, group, 0);

... should then be false

> +/*
> + * each allocation context (i.e. a thread doing allocation) has own
> + * sliding prefetch window of @s_mb_prefetch size which starts at the
> + * very first goal and moves ahead of scaning.
> + * a side effect is that subsequent allocations will likely find
> + * the bitmaps in cache or at least in-flight.
> + */
> +static void

(style) return type should be on the same line as the function declaration
unless it can't fit, which isn't the case here

> +ext4_mb_prefetch(struct ext4_allocation_context *ac,
> +		    ext4_group_t start)

(style) align after '(' on previous line

> +{
> +	struct super_block *sb = ac->ac_sb;
> +	ext4_group_t ngroups = ext4_get_groups_count(sb);
> +	struct ext4_sb_info *sbi = EXT4_SB(sb);
> +	struct ext4_group_info *grp;
> +	ext4_group_t group = start;
> +	struct buffer_head *bh;
> +	int nr;
> +
> +	/* limit prefetching at cr=0, otherwise mballoc can
> +	 * spend a lot of time loading imperfect groups */
> +	if (!ac->ac_criteria && ac->ac_prefetch_ios >= sbi->s_mb_prefetch_limit)
> +		return;
> +
> +	/* batch prefetching to get few READs in flight */
> +	nr = ac->ac_prefetch - group;
> +	if (ac->ac_prefetch < group)
> +		/* wrapped to the first groups */
> +		nr += ngroups;
> +	if (nr > 0)
> +		return;

This is a bit non-obvious.  Clearly, if ac_prefetch < group implies that
"nr" is negative, and adding ngroups will make it positive again, but this
would be more clear for the reader if written as:

	if (nr < 0)	/* ac_prefetch wrapped to the first groups */
		nr += ngroups;

> +	BUG_ON(nr < 0);
> +
> +	nr = sbi->s_mb_prefetch;
> +	if (ext4_has_feature_flex_bg(ac->ac_sb)) {
> +		/* align to flex_bg to get more bitmas with a single IO */
> +		nr = (group / sbi->s_mb_prefetch) * sbi->s_mb_prefetch;

This is more commonly written today as, since s_mb_prefetch is not always
a power-of-two value:

		nr = roundup(group, sbi->s_mb_prefetch);

> +		nr = nr + sbi->s_mb_prefetch - group;
> +	}
> +	while (nr-- > 0) {
> +		grp = ext4_get_group_info(sb, group);
> +		/* ignore empty groups - those will be skipped
> +		 * during the scanning as well */
> +		if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) {

Should this also skip groups that we know will not be accepted in this pass
(e.g. if cr < 2 and bb_free < number of blocks being allocated), and leave
fetching the lousy groups to a later pass if they are actually needed?

There is a bit of a balance between fetching all groups actually being
faster due to contiguous reads, vs. skipping groups that are not useful...

> +			bh = ext4_read_block_bitmap_nowait(sb, group, 1);
> +			if (bh && !IS_ERR(bh)) {
> +				if (!buffer_uptodate(bh))
> +					ac->ac_prefetch_ios++;
> +				brelse(bh);
> +			}
> +		}
> +		if (++group >= ngroups)
> +			group = 0;
> +	}
> +	ac->ac_prefetch = group;
> +}
> +
> +static void
> +ext4_mb_prefetch_fini(struct ext4_allocation_context *ac)
> +{
> +	struct ext4_group_info *grp;
> +	ext4_group_t group;
> +	int nr, rc;
> +
> +	/* initialize last window of prefetched groups */

This should probably be a function block comment, maybe s/last/most recent/?

> +	nr = ac->ac_prefetch_ios;
> +	if (nr > EXT4_SB(ac->ac_sb)->s_mb_prefetch)
> +		nr = EXT4_SB(ac->ac_sb)->s_mb_prefetch;
> +	group = ac->ac_prefetch;
> +	while (nr-- > 0) {
> +		grp = ext4_get_group_info(ac->ac_sb, group);
> +		if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) {
> +			rc = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
> +			if (rc)
> +				break;
> +		}
> +		if (group-- == 0)
> +			group = ext4_get_groups_count(ac->ac_sb) - 1;
> +	}
> +}
> +
> static noinline_for_stack int
> ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
> {
> @@ -2175,7 +2256,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
> 		 * searching for the right group start
> 		 * from the goal value specified
> 		 */
> -		group = ac->ac_g_ex.fe_group;
> +		group = ac->ac_g_ex.fe_group + 1;
> +		ac->ac_prefetch = group;
> 
> 		for (i = 0; i < ngroups; group++, i++) {
> 			int ret = 0;
> @@ -2187,6 +2269,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
> 			if (group >= ngroups)
> 				group = 0;
> 
> +			ext4_mb_prefetch(ac, group);
> +
> 			/* This now checks without needing the buddy page */
> 			ret = ext4_mb_good_group(ac, group, cr);
> 			if (ret <= 0) {
> @@ -2259,6 +2343,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
> out:
> 	if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
> 		err = first_err;
> +	/* use prefetched bitmaps to init buddy so that read info is not lost */
> +	ext4_mb_prefetch_fini(ac);
> 	return err;
> }
> 
> @@ -2880,6 +2966,22 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
> 			bio_put(discard_bio);
> 		}
> 	}
> +	if (ext4_has_feature_flex_bg(sb)) {
> +		/* a single flex group is supposed to be read by a single IO */
> +		sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex;
> +		sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
> +	} else {
> +		sbi->s_mb_prefetch = 32;
> +	}
> +	if (sbi->s_mb_prefetch >= ext4_get_groups_count(sb) >> 2)
> +		sbi->s_mb_prefetch = ext4_get_groups_count(sb) >> 2;
> +	/* now many real IOs to prefetch within a single allocation at cr=0
> +	 * given cr=0 is an CPU-related optimization we shouldn't try to
> +	 * load too many groups, at some point we should start to use what
> +	 * we've got in memory.
> +	 * with an average random access time 5ms, it'd take a second to get
> +	 * 200 groups (* N with flex_bg), so let's make this limit 256 */
> +	sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 256;

This seems a bit on the high side?  For a flex_bg size of 256 this works
out to prefetching 524k bitmaps, which is 64TiB worth of block allocation
(10% of the current "large" 640TiB filesystems).  Without the flex_bg
multiplier this is 256 * 8 = 2048 bitmaps to scan ahead = 8MB which is OK,
but with a flex_bg size of 256 this would be 2GB of bitmaps.  There
should probably be some fixed upper limit, maybe 32768 bitmaps = 128MB
regardless of the flex_bg size?  That is still 4TiB of space to allocate
from, and the sliding window will still keep the bitmaps loading in the
background if the first groups do not have enough free space.

Ideally, s_mb_prefetch_limit would be set by the amount of time it takes
until the first prefetched bitmaps are actually loaded and can be checked.
Once the prefetch pipeline is full, there is no need to prefetch even more
bitmaps that are in danger of being evicted.

> diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
> index 88c98f17e3d9..c96a2bd81f72 100644
> --- a/fs/ext4/mballoc.h
> +++ b/fs/ext4/mballoc.h
> @@ -175,6 +175,8 @@ struct ext4_allocation_context {
> 	struct page *ac_buddy_page;
> 	struct ext4_prealloc_space *ac_pa;
> 	struct ext4_locality_group *ac_lg;
> +	ext4_group_t ac_prefetch;
> +	int ac_prefetch_ios; /* number of initialied prefetch IO */

"initialized" ?

> #define AC_STATUS_CONTINUE	1
> diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
> index eb1efad0e20a..a14ce23c1444 100644
> --- a/fs/ext4/sysfs.c
> +++ b/fs/ext4/sysfs.c
> @@ -186,6 +186,8 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
> EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
> EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
> EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
> +EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch);
> +EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit);
> EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
> EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
> EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
> @@ -215,6 +217,8 @@ static struct attribute *ext4_attrs[] = {
> 	ATTR_LIST(mb_order2_req),
> 	ATTR_LIST(mb_stream_req),
> 	ATTR_LIST(mb_group_prealloc),
> +	ATTR_LIST(mb_prefetch),
> +	ATTR_LIST(mb_prefetch_limit),
> 	ATTR_LIST(max_writeback_mb_bump),
> 	ATTR_LIST(extent_max_zeroout_kb),
> 	ATTR_LIST(trigger_fs_error),

Cheers, Andreas
Alex Zhuravlev Dec. 3, 2019, 9:36 a.m. UTC | #2
> On 3 Dec 2019, at 00:46, Andreas Dilger <adilger@dilger.ca> wrote:
> 

Thanks for the review, here is the refreshed patch, will think a bit more about skipping in ext4_mb_prefetch()

Thanks, Alex

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 0b202e00d93f..12ebbf482f7c 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -397,6 +397,7 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
  * ext4_read_block_bitmap_nowait()
  * @sb:			super block
  * @block_group:	given block group
+ * @ignore_locked:	do not block on locked bh
  *
  * Read the bitmap for a given block_group,and validate the
  * bits for block/inode/inode tables are set in the bitmaps
@@ -404,7 +405,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
  * Return buffer_head on success or NULL in case of failure.
  */
 struct buffer_head *
-ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
+				 bool ignore_locked)
 {
 	struct ext4_group_desc *desc;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -435,6 +437,13 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
 	if (bitmap_uptodate(bh))
 		goto verify;
 
+	if (ignore_locked && buffer_locked(bh)) {
+		/* buffer under IO already, do not wait
+		 * if called for prefetching */
+		put_bh(bh);
+		return NULL;
+	}
+
 	lock_buffer(bh);
 	if (bitmap_uptodate(bh)) {
 		unlock_buffer(bh);
@@ -524,7 +533,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 	struct buffer_head *bh;
 	int err;
 
-	bh = ext4_read_block_bitmap_nowait(sb, block_group);
+	bh = ext4_read_block_bitmap_nowait(sb, block_group, true);
 	if (IS_ERR(bh))
 		return bh;
 	err = ext4_wait_block_bitmap(sb, block_group, bh);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f8578caba40d..edbd94a53221 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1485,6 +1485,10 @@ struct ext4_sb_info {
 	/* where last allocation was done - for stream allocation */
 	unsigned long s_mb_last_group;
 	unsigned long s_mb_last_start;
+	/* bitmap prefetch window */
+	unsigned int s_mb_prefetch;
+	/* how many bitmaps to prefetch at most at cr=0 */
+	unsigned int s_mb_prefetch_limit;
 
 	/* stats for buddy allocator */
 	atomic_t s_bal_reqs;	/* number of reqs with len > 1 */
@@ -2339,7 +2343,8 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
 extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
 
 extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
-						ext4_group_t block_group);
+						ext4_group_t block_group,
+						bool ignore_locked);
 extern int ext4_wait_block_bitmap(struct super_block *sb,
 				  ext4_group_t block_group,
 				  struct buffer_head *bh);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 7c6c34fd8e1c..03cc6f8dd22c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -861,7 +861,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
 			bh[i] = NULL;
 			continue;
 		}
-		bh[i] = ext4_read_block_bitmap_nowait(sb, group);
+		bh[i] = ext4_read_block_bitmap_nowait(sb, group, false);
 		if (IS_ERR(bh[i])) {
 			err = PTR_ERR(bh[i]);
 			bh[i] = NULL;
@@ -2103,6 +2103,85 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 	return 0;
 }
 
+/*
+ * each allocation context (i.e. a thread doing allocation) has own
+ * sliding prefetch window of @s_mb_prefetch size which starts at the
+ * very first goal and moves ahead of scaning.
+ * a side effect is that subsequent allocations will likely find
+ * the bitmaps in cache or at least in-flight.
+ */
+static void ext4_mb_prefetch(struct ext4_allocation_context *ac,
+			     ext4_group_t start)
+{
+	struct super_block *sb = ac->ac_sb;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_group_info *grp;
+	ext4_group_t group = start;
+	struct buffer_head *bh;
+	int nr;
+
+	/* limit prefetching at cr=0, otherwise mballoc can
+	 * spend a lot of time loading imperfect groups */
+	if (!ac->ac_criteria && ac->ac_prefetch_ios >= sbi->s_mb_prefetch_limit)
+		return;
+
+	/* batch prefetching to get few READs in flight */
+	nr = ac->ac_prefetch - group;
+	if (nr < 0) /* ac_prefetch wrapped to the first groups */
+		nr += ngroups;
+	if (nr > 0)
+		return;
+	BUG_ON(nr < 0);
+
+	nr = sbi->s_mb_prefetch;
+	if (ext4_has_feature_flex_bg(ac->ac_sb)) {
+		/* align to flex_bg to get more bitmas with a single IO */
+		//nr = roundup(group, sbi->s_mb_prefetch);
+		nr = (group / sbi->s_mb_prefetch) * sbi->s_mb_prefetch;
+		nr = nr + sbi->s_mb_prefetch - group;
+	}
+	while (nr-- > 0) {
+		grp = ext4_get_group_info(sb, group);
+		/* ignore empty groups - those will be skipped
+		 * during the scanning as well */
+		if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) {
+			bh = ext4_read_block_bitmap_nowait(sb, group, true);
+			if (bh && !IS_ERR(bh)) {
+				if (!buffer_uptodate(bh))
+					ac->ac_prefetch_ios++;
+				brelse(bh);
+			}
+		}
+		if (++group >= ngroups)
+			group = 0;
+	}
+	ac->ac_prefetch = group;
+}
+
+/* initialize most recent prefetched groups */
+static void ext4_mb_prefetch_fini(struct ext4_allocation_context *ac)
+{
+	struct ext4_group_info *grp;
+	ext4_group_t group;
+	int nr, rc;
+
+	nr = ac->ac_prefetch_ios;
+	if (nr > EXT4_SB(ac->ac_sb)->s_mb_prefetch)
+		nr = EXT4_SB(ac->ac_sb)->s_mb_prefetch;
+	group = ac->ac_prefetch;
+	while (nr-- > 0) {
+		grp = ext4_get_group_info(ac->ac_sb, group);
+		if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) {
+			rc = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
+			if (rc)
+				break;
+		}
+		if (group-- == 0)
+			group = ext4_get_groups_count(ac->ac_sb) - 1;
+	}
+}
+
 static noinline_for_stack int
 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 {
@@ -2175,7 +2254,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 		 * searching for the right group start
 		 * from the goal value specified
 		 */
-		group = ac->ac_g_ex.fe_group;
+		group = ac->ac_g_ex.fe_group + 1;
+		ac->ac_prefetch = group;
 
 		for (i = 0; i < ngroups; group++, i++) {
 			int ret = 0;
@@ -2187,6 +2267,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 			if (group >= ngroups)
 				group = 0;
 
+			ext4_mb_prefetch(ac, group);
+
 			/* This now checks without needing the buddy page */
 			ret = ext4_mb_good_group(ac, group, cr);
 			if (ret <= 0) {
@@ -2259,6 +2341,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 out:
 	if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
 		err = first_err;
+	/* use prefetched bitmaps to init buddy so that read info is not lost */
+	ext4_mb_prefetch_fini(ac);
 	return err;
 }
 
@@ -2880,6 +2964,20 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
 			bio_put(discard_bio);
 		}
 	}
+	if (ext4_has_feature_flex_bg(sb)) {
+		/* a single flex group is supposed to be read by a single IO */
+		sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex;
+		sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
+	} else {
+		sbi->s_mb_prefetch = 32;
+	}
+	if (sbi->s_mb_prefetch >= ext4_get_groups_count(sb) >> 2)
+		sbi->s_mb_prefetch = ext4_get_groups_count(sb) >> 2;
+	/* now many real IOs to prefetch within a single allocation at cr=0
+	 * given cr=0 is an CPU-related optimization we shouldn't try to
+	 * load too many groups, at some point we should start to use what
+	 * we've got in memory */
+	sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 64;
 
 	list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
 		ext4_free_data_in_buddy(sb, entry);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 88c98f17e3d9..73868ffb9a38 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -175,6 +175,8 @@ struct ext4_allocation_context {
 	struct page *ac_buddy_page;
 	struct ext4_prealloc_space *ac_pa;
 	struct ext4_locality_group *ac_lg;
+	ext4_group_t ac_prefetch;
+	int ac_prefetch_ios; /* number of initialized prefetch IOs */
 };
 
 #define AC_STATUS_CONTINUE	1
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index eb1efad0e20a..a14ce23c1444 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -186,6 +186,8 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
 EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
 EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch);
+EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit);
 EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
 EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
 EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
@@ -215,6 +217,8 @@ static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(mb_order2_req),
 	ATTR_LIST(mb_stream_req),
 	ATTR_LIST(mb_group_prealloc),
+	ATTR_LIST(mb_prefetch),
+	ATTR_LIST(mb_prefetch_limit),
 	ATTR_LIST(max_writeback_mb_bump),
 	ATTR_LIST(extent_max_zeroout_kb),
 	ATTR_LIST(trigger_fs_error),
diff mbox series

Patch

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 0b202e00d93f..0bc694c5dcfe 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -404,7 +404,8 @@  static int ext4_validate_block_bitmap(struct super_block *sb,
  * Return buffer_head on success or NULL in case of failure.
  */
 struct buffer_head *
-ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
+				 int ignore_locked)
 {
 	struct ext4_group_desc *desc;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -435,6 +436,13 @@  ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
 	if (bitmap_uptodate(bh))
 		goto verify;
 
+	if (ignore_locked && buffer_locked(bh)) {
+		/* buffer under IO already, do not wait
+		 * if called for prefetching */
+		put_bh(bh);
+		return NULL;
+	}
+
 	lock_buffer(bh);
 	if (bitmap_uptodate(bh)) {
 		unlock_buffer(bh);
@@ -524,7 +532,7 @@  ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 	struct buffer_head *bh;
 	int err;
 
-	bh = ext4_read_block_bitmap_nowait(sb, block_group);
+	bh = ext4_read_block_bitmap_nowait(sb, block_group, 1);
 	if (IS_ERR(bh))
 		return bh;
 	err = ext4_wait_block_bitmap(sb, block_group, bh);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f8578caba40d..4a7f4ccd8641 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1485,6 +1485,8 @@  struct ext4_sb_info {
 	/* where last allocation was done - for stream allocation */
 	unsigned long s_mb_last_group;
 	unsigned long s_mb_last_start;
+	unsigned int s_mb_prefetch;
+	unsigned int s_mb_prefetch_limit;
 
 	/* stats for buddy allocator */
 	atomic_t s_bal_reqs;	/* number of reqs with len > 1 */
@@ -2339,7 +2341,8 @@  extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
 extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
 
 extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
-						ext4_group_t block_group);
+						ext4_group_t block_group,
+						int ignore_locked);
 extern int ext4_wait_block_bitmap(struct super_block *sb,
 				  ext4_group_t block_group,
 				  struct buffer_head *bh);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 7c6c34fd8e1c..e4d93c9a6b77 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -861,7 +861,7 @@  static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
 			bh[i] = NULL;
 			continue;
 		}
-		bh[i] = ext4_read_block_bitmap_nowait(sb, group);
+		bh[i] = ext4_read_block_bitmap_nowait(sb, group, 0);
 		if (IS_ERR(bh[i])) {
 			err = PTR_ERR(bh[i]);
 			bh[i] = NULL;
@@ -2103,6 +2103,87 @@  static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 	return 0;
 }
 
+/*
+ * each allocation context (i.e. a thread doing allocation) has own
+ * sliding prefetch window of @s_mb_prefetch size which starts at the
+ * very first goal and moves ahead of scaning.
+ * a side effect is that subsequent allocations will likely find
+ * the bitmaps in cache or at least in-flight.
+ */
+static void
+ext4_mb_prefetch(struct ext4_allocation_context *ac,
+		    ext4_group_t start)
+{
+	struct super_block *sb = ac->ac_sb;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_group_info *grp;
+	ext4_group_t group = start;
+	struct buffer_head *bh;
+	int nr;
+
+	/* limit prefetching at cr=0, otherwise mballoc can
+	 * spend a lot of time loading imperfect groups */
+	if (!ac->ac_criteria && ac->ac_prefetch_ios >= sbi->s_mb_prefetch_limit)
+		return;
+
+	/* batch prefetching to get few READs in flight */
+	nr = ac->ac_prefetch - group;
+	if (ac->ac_prefetch < group)
+		/* wrapped to the first groups */
+		nr += ngroups;
+	if (nr > 0)
+		return;
+	BUG_ON(nr < 0);
+
+	nr = sbi->s_mb_prefetch;
+	if (ext4_has_feature_flex_bg(ac->ac_sb)) {
+		/* align to flex_bg to get more bitmas with a single IO */
+		nr = (group / sbi->s_mb_prefetch) * sbi->s_mb_prefetch;
+		nr = nr + sbi->s_mb_prefetch - group;
+	}
+	while (nr-- > 0) {
+		grp = ext4_get_group_info(sb, group);
+		/* ignore empty groups - those will be skipped
+		 * during the scanning as well */
+		if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) {
+			bh = ext4_read_block_bitmap_nowait(sb, group, 1);
+			if (bh && !IS_ERR(bh)) {
+				if (!buffer_uptodate(bh))
+					ac->ac_prefetch_ios++;
+				brelse(bh);
+			}
+		}
+		if (++group >= ngroups)
+			group = 0;
+	}
+	ac->ac_prefetch = group;
+}
+
+static void
+ext4_mb_prefetch_fini(struct ext4_allocation_context *ac)
+{
+	struct ext4_group_info *grp;
+	ext4_group_t group;
+	int nr, rc;
+
+	/* initialize last window of prefetched groups */
+	nr = ac->ac_prefetch_ios;
+	if (nr > EXT4_SB(ac->ac_sb)->s_mb_prefetch)
+		nr = EXT4_SB(ac->ac_sb)->s_mb_prefetch;
+	group = ac->ac_prefetch;
+	while (nr-- > 0) {
+		grp = ext4_get_group_info(ac->ac_sb, group);
+		if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) {
+			rc = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
+			if (rc)
+				break;
+		}
+		if (group-- == 0)
+			group = ext4_get_groups_count(ac->ac_sb) - 1;
+	}
+}
+
 static noinline_for_stack int
 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 {
@@ -2175,7 +2256,8 @@  ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 		 * searching for the right group start
 		 * from the goal value specified
 		 */
-		group = ac->ac_g_ex.fe_group;
+		group = ac->ac_g_ex.fe_group + 1;
+		ac->ac_prefetch = group;
 
 		for (i = 0; i < ngroups; group++, i++) {
 			int ret = 0;
@@ -2187,6 +2269,8 @@  ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 			if (group >= ngroups)
 				group = 0;
 
+			ext4_mb_prefetch(ac, group);
+
 			/* This now checks without needing the buddy page */
 			ret = ext4_mb_good_group(ac, group, cr);
 			if (ret <= 0) {
@@ -2259,6 +2343,8 @@  ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 out:
 	if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
 		err = first_err;
+	/* use prefetched bitmaps to init buddy so that read info is not lost */
+	ext4_mb_prefetch_fini(ac);
 	return err;
 }
 
@@ -2880,6 +2966,22 @@  void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
 			bio_put(discard_bio);
 		}
 	}
+	if (ext4_has_feature_flex_bg(sb)) {
+		/* a single flex group is supposed to be read by a single IO */
+		sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex;
+		sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
+	} else {
+		sbi->s_mb_prefetch = 32;
+	}
+	if (sbi->s_mb_prefetch >= ext4_get_groups_count(sb) >> 2)
+		sbi->s_mb_prefetch = ext4_get_groups_count(sb) >> 2;
+	/* now many real IOs to prefetch within a single allocation at cr=0
+	 * given cr=0 is an CPU-related optimization we shouldn't try to
+	 * load too many groups, at some point we should start to use what
+	 * we've got in memory.
+	 * with an average random access time 5ms, it'd take a second to get
+	 * 200 groups (* N with flex_bg), so let's make this limit 256 */
+	sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 256;
 
 	list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
 		ext4_free_data_in_buddy(sb, entry);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 88c98f17e3d9..c96a2bd81f72 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -175,6 +175,8 @@  struct ext4_allocation_context {
 	struct page *ac_buddy_page;
 	struct ext4_prealloc_space *ac_pa;
 	struct ext4_locality_group *ac_lg;
+	ext4_group_t ac_prefetch;
+	int ac_prefetch_ios; /* number of initialied prefetch IO */
 };
 
 #define AC_STATUS_CONTINUE	1
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index eb1efad0e20a..a14ce23c1444 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -186,6 +186,8 @@  EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
 EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
 EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch);
+EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit);
 EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
 EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
 EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
@@ -215,6 +217,8 @@  static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(mb_order2_req),
 	ATTR_LIST(mb_stream_req),
 	ATTR_LIST(mb_group_prealloc),
+	ATTR_LIST(mb_prefetch),
+	ATTR_LIST(mb_prefetch_limit),
 	ATTR_LIST(max_writeback_mb_bump),
 	ATTR_LIST(extent_max_zeroout_kb),
 	ATTR_LIST(trigger_fs_error),