
[1/2] ext4: mballoc - prefetching for bitmaps

Message ID 262A2973-9B2D-4DBE-8752-67E91D52C632@whamcloud.com
State New
Series [1/2] ext4: mballoc - prefetching for bitmaps

Commit Message

Alex Zhuravlev May 15, 2020, 10:07 a.m. UTC
This should significantly improve bitmap loading, especially for flex
groups, as it tries to load all bitmaps within a flex group instead of
one by one synchronously.

Prefetching is done in 8 * flex_bg groups, so it should be 8 read-ahead
reads for a single allocating thread. At the end of allocation the
thread waits for read-ahead completion and initializes buddy information
so that read-aheads are not lost in case of memory pressure.

At cr=0 the number of prefetching IOs is limited per allocation context
to prevent a situation where mballoc loads thousands of bitmaps looking
for a perfect group while ignoring groups with good chunks.

Together with the patch "ext4: limit scanning of uninitialized groups",
the mount time (which includes a few tiny allocations) of a 1PB filesystem
is reduced significantly:

                      0% full    50% full, unpatched    50% full, patched
  mount time              33s                  9279s                 563s

Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
---
 fs/ext4/balloc.c  |  12 ++++-
 fs/ext4/ext4.h    |   8 +++-
 fs/ext4/mballoc.c | 116 +++++++++++++++++++++++++++++++++++++++++++++-
 fs/ext4/mballoc.h |   2 +
 fs/ext4/sysfs.c   |   4 ++
 5 files changed, 138 insertions(+), 4 deletions(-)

Comments

kernel test robot May 20, 2020, 4:45 a.m. UTC | #1
Hi Alex,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on ext4/dev]
[also build test WARNING on v5.7-rc6 next-20200519]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:    https://github.com/0day-ci/linux/commits/Alex-Zhuravlev/ext4-mballoc-prefetching-for-bitmaps/20200515-181552
base:   https://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git dev
config: microblaze-randconfig-r024-20200519 (attached as .config)
compiler: microblaze-linux-gcc (GCC) 9.3.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=microblaze 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kbuild test robot <lkp@intel.com>

All warnings (new ones prefixed by >>, old ones prefixed by <<):

In file included from include/linux/kernel.h:11,
from include/linux/list.h:9,
from include/linux/wait.h:7,
from include/linux/wait_bit.h:8,
from include/linux/fs.h:6,
from fs/ext4/ext4_jbd2.h:15,
from fs/ext4/mballoc.c:12:
fs/ext4/mballoc.c: In function 'ext4_mb_prefetch':
fs/ext4/mballoc.c:2137:12: warning: comparison of unsigned expression < 0 is always false [-Wtype-limits]
2137 |  BUG_ON(nr < 0);
|            ^
include/linux/compiler.h:58:52: note: in definition of macro '__trace_if_var'
58 | #define __trace_if_var(cond) (__builtin_constant_p(cond) ? (cond) : __trace_if_value(cond))
|                                                    ^~~~
>> include/asm-generic/bug.h:62:32: note: in expansion of macro 'if'
62 | #define BUG_ON(condition) do { if (unlikely(condition)) BUG(); } while (0)
|                                ^~
include/linux/compiler.h:48:24: note: in expansion of macro '__branch_check__'
48 | #  define unlikely(x) (__branch_check__(x, 0, __builtin_constant_p(x)))
|                        ^~~~~~~~~~~~~~~~
include/asm-generic/bug.h:62:36: note: in expansion of macro 'unlikely'
62 | #define BUG_ON(condition) do { if (unlikely(condition)) BUG(); } while (0)
|                                    ^~~~~~~~
fs/ext4/mballoc.c:2137:2: note: in expansion of macro 'BUG_ON'
2137 |  BUG_ON(nr < 0);
|  ^~~~~~
   [ the same -Wtype-limits warning on 'BUG_ON(nr < 0)' is emitted five
     more times for the remaining macro expansions; identical output
     trimmed ]

vim +/if +62 include/asm-generic/bug.h

^1da177e4c3f41 Linus Torvalds  2005-04-16  60  
^1da177e4c3f41 Linus Torvalds  2005-04-16  61  #ifndef HAVE_ARCH_BUG_ON
2a41de48b81e61 Alexey Dobriyan 2007-07-17 @62  #define BUG_ON(condition) do { if (unlikely(condition)) BUG(); } while (0)
^1da177e4c3f41 Linus Torvalds  2005-04-16  63  #endif
^1da177e4c3f41 Linus Torvalds  2005-04-16  64  

:::::: The code at line 62 was first introduced by commit
:::::: 2a41de48b81e61fbe260ae5031ebcb6f935f35fb Fix sparse false positives re BUG_ON(ptr)

:::::: TO: Alexey Dobriyan <adobriyan@sw.ru>
:::::: CC: Linus Torvalds <torvalds@woody.linux-foundation.org>

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
kernel test robot May 23, 2020, 7:27 p.m. UTC | #2
Hi Alex,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on ext4/dev]
[also build test WARNING on v5.7-rc6 next-20200522]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:    https://github.com/0day-ci/linux/commits/Alex-Zhuravlev/ext4-mballoc-prefetching-for-bitmaps/20200515-181552
base:   https://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git dev
config: x86_64-lkp (attached as .config)
compiler: gcc-7 (Ubuntu 7.5.0-6ubuntu2) 7.5.0
reproduce (this is a W=1 build):
        # save the attached .config to linux build tree
        make ARCH=x86_64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kbuild test robot <lkp@intel.com>

All warnings (new ones prefixed by >>, old ones prefixed by <<):

In file included from include/linux/export.h:43:0,
from include/linux/linkage.h:7,
from include/linux/fs.h:5,
from fs/ext4/ext4_jbd2.h:15,
from fs/ext4/mballoc.c:12:
fs/ext4/mballoc.c: In function 'ext4_mb_prefetch':
>> fs/ext4/mballoc.c:2137:12: warning: comparison of unsigned expression < 0 is always false [-Wtype-limits]
BUG_ON(nr < 0);
^
include/linux/compiler.h:78:42: note: in definition of macro 'unlikely'
# define unlikely(x) __builtin_expect(!!(x), 0)
^
fs/ext4/mballoc.c:2137:2: note: in expansion of macro 'BUG_ON'
BUG_ON(nr < 0);
^~~~~~

vim +2137 fs/ext4/mballoc.c

  2106	
  2107	/*
  2108	 * each allocation context (i.e. a thread doing allocation) has its own
  2109	 * sliding prefetch window of @s_mb_prefetch size which starts at the
  2110	 * very first goal and moves ahead of the scanning.
  2111	 * a side effect is that subsequent allocations will likely find
  2112	 * the bitmaps in cache or at least in-flight.
  2113	 */
  2114	static void
  2115	ext4_mb_prefetch(struct ext4_allocation_context *ac,
  2116			    ext4_group_t start)
  2117	{
  2118		struct super_block *sb = ac->ac_sb;
  2119		ext4_group_t ngroups = ext4_get_groups_count(sb);
  2120		struct ext4_sb_info *sbi = EXT4_SB(sb);
  2121		struct ext4_group_info *grp;
  2122		ext4_group_t nr, group = start;
  2123		struct buffer_head *bh;
  2124	
  2125		/* limit prefetching at cr=0, otherwise mballoc can
  2126		 * spend a lot of time loading imperfect groups */
  2127		if (ac->ac_criteria < 2 && ac->ac_prefetch_ios >= sbi->s_mb_prefetch_limit)
  2128			return;
  2129	
  2130		/* batch prefetching to get few READs in flight */
  2131		nr = ac->ac_prefetch - group;
  2132		if (ac->ac_prefetch < group)
  2133			/* wrapped to the first groups */
  2134			nr += ngroups;
  2135		if (nr > 0)
  2136			return;
> 2137		BUG_ON(nr < 0);
  2138	
  2139		nr = sbi->s_mb_prefetch;
  2140		if (ext4_has_feature_flex_bg(sb)) {
  2141		/* align to flex_bg to get more bitmaps with a single IO */
  2142			nr = (group / sbi->s_mb_prefetch) * sbi->s_mb_prefetch;
  2143			nr = nr + sbi->s_mb_prefetch - group;
  2144		}
  2145		while (nr-- > 0) {
  2146			grp = ext4_get_group_info(sb, group);
  2147	
  2148			/* prevent expensive getblk() on groups w/ IO in progress */
  2149			if (EXT4_MB_GRP_TEST_AND_SET_READ(grp))
  2150				goto next;
  2151	
  2152			/* ignore empty groups - those will be skipped
  2153			 * during the scanning as well */
  2154			if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) {
  2155				bh = ext4_read_block_bitmap_nowait(sb, group, true);
  2156				if (bh && !IS_ERR(bh)) {
  2157					if (!buffer_uptodate(bh))
  2158						ac->ac_prefetch_ios++;
  2159					brelse(bh);
  2160				}
  2161			}
  2162	next:
  2163			if (++group >= ngroups)
  2164				group = 0;
  2165		}
  2166		ac->ac_prefetch = group;
  2167	}
  2168	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
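For readers following the window arithmetic in the ext4_mb_prefetch() context
above, here is a minimal standalone sketch of the flex_bg alignment, with all
values assumed for illustration: the common mke2fs default of 16 groups per
flex_bg gives s_mb_prefetch = 16 * 8 = 128, and group 200 is an arbitrary
starting point.

	/* sketch: how far ext4_mb_prefetch() reads ahead from 'group',
	 * i.e. up to the end of the enclosing s_mb_prefetch-aligned
	 * window; all values assumed, not taken from a real filesystem */
	#include <stdio.h>

	int main(void)
	{
		unsigned int s_mb_prefetch = 16 * 8;	/* 8 * flex_bg of 16 */
		unsigned int group = 200;		/* assumed start */
		unsigned int nr;

		nr = (group / s_mb_prefetch) * s_mb_prefetch;	/* window start: 128 */
		nr = nr + s_mb_prefetch - group;		/* to window end: 56 */
		printf("prefetch %u groups, up to group %u\n", nr, group + nr);
		return 0;
	}

A thread starting at group 200 thus prefetches 56 bitmaps, stopping at the
group-256 window boundary rather than always reading a fixed 128 ahead.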
Theodore Ts'o May 28, 2020, 2:47 p.m. UTC | #3
On Fri, May 15, 2020 at 10:07:06AM +0000, Alex Zhuravlev wrote:
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -2127,6 +2127,96 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
>  	return 0;
>  }
>  
> +/*
> + * each allocation context (i.e. a thread doing allocation) has its own
> + * sliding prefetch window of @s_mb_prefetch size which starts at the
> + * very first goal and moves ahead of the scanning.
> + * a side effect is that subsequent allocations will likely find
> + * the bitmaps in cache or at least in-flight.
> + */
> +static void
> +ext4_mb_prefetch(struct ext4_allocation_context *ac,
> +		    ext4_group_t start)
> +{
> +	struct super_block *sb = ac->ac_sb;
> +	ext4_group_t ngroups = ext4_get_groups_count(sb);
> +	struct ext4_sb_info *sbi = EXT4_SB(sb);
> +	struct ext4_group_info *grp;
> +	ext4_group_t nr, group = start;
> +	struct buffer_head *bh;
> +
> +	/* limit prefetching at cr=0, otherwise mballoc can
> +	 * spend a lot of time loading imperfect groups */
> +	if (ac->ac_criteria < 2 && ac->ac_prefetch_ios >= sbi->s_mb_prefetch_limit)
> +		return;
> +
> +	/* batch prefetching to get few READs in flight */
> +	nr = ac->ac_prefetch - group;
> +	if (ac->ac_prefetch < group)
> +		/* wrapped to the first groups */
> +		nr += ngroups;
> +	if (nr > 0)
> +		return;
> +	BUG_ON(nr < 0);

What are you trying to do here?  If nr > 0, we return; if nr < 0, we
BUG() --- but nr is an unsigned int, so it can never trigger --- which
was the warning reported by the kbuild test bot.  So we will only get
past this point if ac_prefetch == group.  But ac_prefetch appears to
be the last group that we prefetched, so it's not clear that the logic
is correct here.

						- Ted
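The warning the robot flagged comes down to ext4_group_t being an unsigned
type: a "negative" difference wraps to a large positive value instead, so
BUG_ON(nr < 0) is dead code. A minimal standalone sketch of the wrap (not
from the patch):

	/* sketch: with unsigned types, prefetch - group never goes
	 * negative, it wraps; (nr < 0) is therefore constant-false and
	 * gcc emits -Wtype-limits, as in the robot reports above */
	#include <stdio.h>

	typedef unsigned int ext4_group_t;	/* as in fs/ext4/ext4.h */

	int main(void)
	{
		ext4_group_t prefetch = 5, group = 7;
		ext4_group_t nr = prefetch - group;	/* wraps to 4294967294 */

		printf("nr = %u, (nr < 0) = %d\n", nr, nr < 0);	/* always 0 */
		return 0;
	}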
Artem Blagodarenko May 29, 2020, 4:19 p.m. UTC | #4
Hello Alex,

Some remarks below, after the code quote.

Also, we have encountered a drop in the directory creation rate with this patch (not exactly this one, but the Lustre FS version): from 70-80K to 30-40K.
Excluding this patch restores the rates to their original values.
I am investigating it now. Alex, do you expect this optimisation to have an impact on name creation?
Is creating plenty of files and directories a corner case for this optimisation?

Thanks,
Best regards,
Artem Blagodarenko.

> On 15 May 2020, at 13:07, Alex Zhuravlev <azhuravlev@whamcloud.com> wrote:
> 
> This should significantly improve bitmap loading, especially for flex
> groups, as it tries to load all bitmaps within a flex group instead of
> one by one synchronously.
> 
> Prefetching is done in 8 * flex_bg groups, so it should be 8 read-ahead
> reads for a single allocating thread. At the end of allocation the
> thread waits for read-ahead completion and initializes buddy information
> so that read-aheads are not lost in case of memory pressure.
> 
> At cr=0 the number of prefetching IOs is limited per allocation context
> to prevent a situation where mballoc loads thousands of bitmaps looking
> for a perfect group while ignoring groups with good chunks.
> 
> Together with the patch "ext4: limit scanning of uninitialized groups",
> the mount time (which includes a few tiny allocations) of a 1PB filesystem
> is reduced significantly:
> 
>                      0% full    50% full, unpatched    50% full, patched
>  mount time              33s                  9279s                 563s
> 
> Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
> Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
> ---
> fs/ext4/balloc.c  |  12 ++++-
> fs/ext4/ext4.h    |   8 +++-
> fs/ext4/mballoc.c | 116 +++++++++++++++++++++++++++++++++++++++++++++-
> fs/ext4/mballoc.h |   2 +
> fs/ext4/sysfs.c   |   4 ++
> 5 files changed, 138 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
> index a32e5f7b5385..6712146195ed 100644
> --- a/fs/ext4/balloc.c
> +++ b/fs/ext4/balloc.c
> @@ -413,7 +413,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
>  * Return buffer_head on success or an ERR_PTR in case of failure.
>  */
> struct buffer_head *
> -ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
> +ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
> +				 bool ignore_locked)
> {
> 	struct ext4_group_desc *desc;
> 	struct ext4_sb_info *sbi = EXT4_SB(sb);
> @@ -444,6 +445,13 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
> 	if (bitmap_uptodate(bh))
> 		goto verify;
> 
> +	if (ignore_locked && buffer_locked(bh)) {
> +		/* buffer under IO already, do not wait
> +		 * if called for prefetching */
> +		put_bh(bh);
> +		return NULL;
> +	}
> +
> 	lock_buffer(bh);
> 	if (bitmap_uptodate(bh)) {
> 		unlock_buffer(bh);
> @@ -534,7 +542,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
> 	struct buffer_head *bh;
> 	int err;
> 
> -	bh = ext4_read_block_bitmap_nowait(sb, block_group);
> +	bh = ext4_read_block_bitmap_nowait(sb, block_group, false);
> 	if (IS_ERR(bh))
> 		return bh;
> 	err = ext4_wait_block_bitmap(sb, block_group, bh);
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 91eb4381cae5..521fbcd8efc7 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1483,6 +1483,8 @@ struct ext4_sb_info {
> 	/* where last allocation was done - for stream allocation */
> 	unsigned long s_mb_last_group;
> 	unsigned long s_mb_last_start;
> +	unsigned int s_mb_prefetch;
> +	unsigned int s_mb_prefetch_limit;
> 
> 	/* stats for buddy allocator */
> 	atomic_t s_bal_reqs;	/* number of reqs with len > 1 */
> @@ -2420,7 +2422,8 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
> extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
> 
> extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
> -						ext4_group_t block_group);
> +						ext4_group_t block_group,
> +						bool ignore_locked);
> extern int ext4_wait_block_bitmap(struct super_block *sb,
> 				  ext4_group_t block_group,
> 				  struct buffer_head *bh);
> @@ -3119,6 +3122,7 @@ struct ext4_group_info {
> 	(1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT)
> #define EXT4_GROUP_INFO_IBITMAP_CORRUPT		\
> 	(1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT)
> +#define EXT4_GROUP_INFO_BBITMAP_READ_BIT	4
> 
> #define EXT4_MB_GRP_NEED_INIT(grp)	\
> 	(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
> @@ -3133,6 +3137,8 @@ struct ext4_group_info {
> 	(set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
> #define EXT4_MB_GRP_CLEAR_TRIMMED(grp)	\
> 	(clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
> +#define EXT4_MB_GRP_TEST_AND_SET_READ(grp)	\
> +	(test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state)))
> 
> #define EXT4_MAX_CONTENTION		8
> #define EXT4_CONTENTION_THRESHOLD	2
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index afb8bd9a10e9..ebfe258bfd0f 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -861,7 +861,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
> 			bh[i] = NULL;
> 			continue;
> 		}
> -		bh[i] = ext4_read_block_bitmap_nowait(sb, group);
> +		bh[i] = ext4_read_block_bitmap_nowait(sb, group, false);
> 		if (IS_ERR(bh[i])) {
> 			err = PTR_ERR(bh[i]);
> 			bh[i] = NULL;
> @@ -2127,6 +2127,96 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
> 	return 0;
> }
> 
> +/*
> + * each allocation context (i.e. a thread doing allocation) has its own
> + * sliding prefetch window of @s_mb_prefetch size which starts at the
> + * very first goal and moves ahead of the scanning.
> + * a side effect is that subsequent allocations will likely find
> + * the bitmaps in cache or at least in-flight.
> + */
> +static void
> +ext4_mb_prefetch(struct ext4_allocation_context *ac,
> +		    ext4_group_t start)
> +{
> +	struct super_block *sb = ac->ac_sb;
> +	ext4_group_t ngroups = ext4_get_groups_count(sb);
> +	struct ext4_sb_info *sbi = EXT4_SB(sb);
> +	struct ext4_group_info *grp;
> +	ext4_group_t nr, group = start;
> +	struct buffer_head *bh;
> +

Could it be useful to give the ability to disable this optimisation? As an option, by setting s_mb_prefetch to zero.
Now 0 in s_mb_prefetch only allows skipping the optimisation for cr=0 and cr=1.

Something like:

if (sbi->s_mb_prefetch == 0)
	return;

Some changes in ext4_mb_prefetch_fini() are also probably required.

> +	/* limit prefetching at cr=0, otherwise mballoc can
> +	 * spend a lot of time loading imperfect groups */
> +	if (ac->ac_criteria < 2 && ac->ac_prefetch_ios >= sbi->s_mb_prefetch_limit)
> +		return;

A comment above says prefetching is limited for cr=0, but the code limits it for both cr=0 and cr=1.
Do you need to change the comment or the code?

> +
> +	/* batch prefetching to get few READs in flight */
> +	nr = ac->ac_prefetch - group;
> +	if (ac->ac_prefetch < group)
> +		/* wrapped to the first groups */
> +		nr += ngroups;
> +	if (nr > 0)
> +		return;
> +	BUG_ON(nr < 0);
> +
> +	nr = sbi->s_mb_prefetch;
> +	if (ext4_has_feature_flex_bg(sb)) {
> +		/* align to flex_bg to get more bitmaps with a single IO */
> +		nr = (group / sbi->s_mb_prefetch) * sbi->s_mb_prefetch;
> +		nr = nr + sbi->s_mb_prefetch - group;
> +	}
> +	while (nr-- > 0) {
> +		grp = ext4_get_group_info(sb, group);
> +
> +		/* prevent expensive getblk() on groups w/ IO in progress */
> +		if (EXT4_MB_GRP_TEST_AND_SET_READ(grp))
> +			goto next;
> +
> +		/* ignore empty groups - those will be skipped
> +		 * during the scanning as well */
> +		if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) {
> +			bh = ext4_read_block_bitmap_nowait(sb, group, true);
> +			if (bh && !IS_ERR(bh)) {
> +				if (!buffer_uptodate(bh))
> +					ac->ac_prefetch_ios++;
> +				brelse(bh);
> +			}
> +		}
> +next:
> +		if (++group >= ngroups)
> +			group = 0;
> +	}
> +	ac->ac_prefetch = group;
> +}
> +
> +static void
> +ext4_mb_prefetch_fini(struct ext4_allocation_context *ac)
> +{
> +	struct ext4_group_info *grp;
> +	ext4_group_t group;
> +	int nr, rc;
> +
> +	/* initialize last window of prefetched groups */
> +	nr = ac->ac_prefetch_ios;
> +	if (nr > EXT4_SB(ac->ac_sb)->s_mb_prefetch)
> +		nr = EXT4_SB(ac->ac_sb)->s_mb_prefetch;
> +	group = ac->ac_prefetch;
> +	if (!group)
> +		group = ext4_get_groups_count(ac->ac_sb);
> +	group--;
> +	while (nr-- > 0) {
> +		grp = ext4_get_group_info(ac->ac_sb, group);
> +		if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) {
> +			rc = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
> +			if (rc)
> +				break;
> +		}
> +		if (!group)
> +			group = ext4_get_groups_count(ac->ac_sb);
> +		group--;
> +	}
> +}
> +
> static noinline_for_stack int
> ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
> {
> @@ -2200,6 +2290,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
> 		 * from the goal value specified
> 		 */
> 		group = ac->ac_g_ex.fe_group;
> +		ac->ac_prefetch = group;
> 
> 		for (i = 0; i < ngroups; group++, i++) {
> 			int ret = 0;
> @@ -2211,6 +2302,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
> 			if (group >= ngroups)
> 				group = 0;
> 
> +			ext4_mb_prefetch(ac, group);
> +
> 			/* This now checks without needing the buddy page */
> 			ret = ext4_mb_good_group(ac, group, cr);
> 			if (ret <= 0) {
> @@ -2283,6 +2376,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
> out:
> 	if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
> 		err = first_err;
> +	/* use prefetched bitmaps to init buddy so that read info is not lost */
> +	ext4_mb_prefetch_fini(ac);
> 	return err;
> }
> 
> @@ -2542,6 +2637,25 @@ static int ext4_mb_init_backend(struct super_block *sb)
> 			goto err_freebuddy;
> 	}
> 
> +	if (ext4_has_feature_flex_bg(sb)) {
> +		/* a single flex group is supposed to be read by a single IO */
> +		sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex;
> +		sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
> +	} else {
> +		sbi->s_mb_prefetch = 32;
> +	}
> +	if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
> +		sbi->s_mb_prefetch = ext4_get_groups_count(sb);
> +	/* now many real IOs to prefetch within a single allocation at cr=0

Do you mean “how many” here? At cr=0 and cr=1?

> +	 * given cr=0 is a CPU-related optimization we shouldn't try to
> +	 * load too many groups, at some point we should start to use what
> +	 * we've got in memory.
> +	 * with an average random access time 5ms, it'd take a second to get
> +	 * 200 groups (* N with flex_bg), so let's make this limit 4 */
> +	sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
> +	if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
> +		sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);
> +
> 	return 0;
> 
> err_freebuddy:
> diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
> index 88c98f17e3d9..c96a2bd81f72 100644
> --- a/fs/ext4/mballoc.h
> +++ b/fs/ext4/mballoc.h
> @@ -175,6 +175,8 @@ struct ext4_allocation_context {
> 	struct page *ac_buddy_page;
> 	struct ext4_prealloc_space *ac_pa;
> 	struct ext4_locality_group *ac_lg;
> +	ext4_group_t ac_prefetch;
> +	int ac_prefetch_ios; /* number of initialized prefetch IOs */
> };
> 
> #define AC_STATUS_CONTINUE	1
> diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
> index 04bfaf63752c..5f443f9d54b8 100644
> --- a/fs/ext4/sysfs.c
> +++ b/fs/ext4/sysfs.c
> @@ -240,6 +240,8 @@ EXT4_RO_ATTR_ES_STRING(last_error_func, s_last_error_func, 32);
> EXT4_ATTR(first_error_time, 0444, first_error_time);
> EXT4_ATTR(last_error_time, 0444, last_error_time);
> EXT4_ATTR(journal_task, 0444, journal_task);
> +EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch);
> +EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit);
> 
> static unsigned int old_bump_val = 128;
> EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
> @@ -283,6 +285,8 @@ static struct attribute *ext4_attrs[] = {
> #ifdef CONFIG_EXT4_DEBUG
> 	ATTR_LIST(simulate_fail),
> #endif
> +	ATTR_LIST(mb_prefetch),
> +	ATTR_LIST(mb_prefetch_limit),
> 	NULL,
> };
> ATTRIBUTE_GROUPS(ext4);
> -- 
> 2.21.3
>
Alex Zhuravlev May 30, 2020, 4:57 a.m. UTC | #5
> On 28 May 2020, at 17:47, Theodore Y. Ts'o <tytso@mit.edu> wrote:
> 
> What are you trying to do here?  If nr > 0, we return; if nr < 0, we
> BUG() --- but nr is an unsigned int, so we never can trigger --- which
> was the warning reported by the kbuild test bot.  So we will only get
> past this point if ac_prefetch == group.  But ac_prefetch appears to
> be the last group that we prefetched, so it's not clear that the logic
> is correct here.

You’re right, this part “evolved” since the initial version, but I forgot to clean it up.
Basically this should be replaced with:

if (ac->ac_prefetch != group)
	return;

ac->ac_prefetch is just a cursor for the current process.

Thanks, Alex
Alex Zhuravlev May 30, 2020, 5:01 a.m. UTC | #6
Hi

> On 29 May 2020, at 19:19, Благодаренко Артём <artem.blagodarenko@gmail.com> wrote:
> 
> Also, we have encountered a drop in the directory creation rate with this patch (not exactly this one, but the Lustre FS version): from 70-80K to 30-40K.
> Excluding this patch restores the rates to their original values.
> I am investigating it now. Alex, do you expect this optimisation to have an impact on name creation?
> Is creating plenty of files and directories a corner case for this optimisation?

Noticed as well, the last version posted to the list should have this problem fixed.

> 
> Could it be useful to give the ability to disable this optimisation? As an option, by setting s_mb_prefetch to zero.
> Now 0 in s_mb_prefetch only allows skipping the optimisation for cr=0 and cr=1.

s_mb_prefetch_limit=0 can be used to disable prefetching?

>> +	/* limit prefetching at cr=0, otherwise mballoc can
>> +	 * spend a lot of time loading imperfect groups */
>> +	if (ac->ac_criteria < 2 && ac->ac_prefetch_ios >= sbi->s_mb_prefetch_limit)
>> +		return;
> 
> A comment above says prefetching is limited for cr=0, but the code limits it for both cr=0 and cr=1.
> Do you need to change the comment or the code?

OK

>> +	if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
>> +		sbi->s_mb_prefetch = ext4_get_groups_count(sb);
>> +	/* now many real IOs to prefetch within a single allocation at cr=0
> 
> Do you mean “how many” here? At cr=0 and cr=1?

Yes, of course


Thanks, Alex
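For reference, the two knobs this patch adds are plain sysfs attributes, so
runtime tuning would look like the sketch below; the device name is assumed,
and per the check in ext4_mb_prefetch() a zero mb_prefetch_limit only stops
prefetching for the cr=0/1 passes:

	# assumed device: sda1
	cat /sys/fs/ext4/sda1/mb_prefetch		# window size, in groups
	cat /sys/fs/ext4/sda1/mb_prefetch_limit		# IO cap per allocation
	echo 0 > /sys/fs/ext4/sda1/mb_prefetch_limit	# stop cr=0/1 prefetch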
Theodore Ts'o June 19, 2020, 5:16 p.m. UTC | #7
On Sat, May 30, 2020 at 05:01:40AM +0000, Alex Zhuravlev wrote:
> 
> Hi
> 
> > On 29 May 2020, at 19:19, Благодаренко Артём <artem.blagodarenko@gmail.com> wrote:
> > 
> > Also, we have encountered a drop in the directory creation rate with this patch (not exactly this one, but the Lustre FS version): from 70-80K to 30-40K.
> > Excluding this patch restores the rates to their original values.
> > I am investigating it now. Alex, do you expect this optimisation to have an impact on name creation?
> > Is creating plenty of files and directories a corner case for this optimisation?
> 
> Noticed as well, the last version posted to the list should have this problem fixed.

I'm not seeing any newer version of this patch since the one on this
thread, posted on May 15th.

Am I missing something?   What's the latest version of this patch that you have?

     	     		  	     	    - Ted
Alex Zhuravlev June 19, 2020, 6:23 p.m. UTC | #8
Please find it attached, rebased against the latest tree.

Thanks, Alex


> On 19 Jun 2020, at 20:16, Theodore Y. Ts'o <tytso@mit.edu> wrote:
> 
> I'm not seeing any newer version of this patch since the one on this
> thread, posted on May 15th.
> 
> Am I missing something?   What's the latest version of this patch that you have?
> 
>     	     		  	     	    - Ted


From 023a44b30b1c77873401ee310705933482a89b02 Mon Sep 17 00:00:00 2001
From: Alex Zhuravlev <bzzz@whamcloud.com>
Date: Tue, 21 Apr 2020 10:54:07 +0300
Subject: [PATCH] [PATCH 1/2] ext4:  mballoc - prefetching for bitmaps

This should significantly improve bitmap loading, especially for flex
groups, as it tries to load all bitmaps within a flex group instead of
one by one synchronously.

Prefetching is done in 8 * flex_bg groups, so it should be 8 read-ahead
reads for a single allocating thread. At the end of allocation the
thread waits for read-ahead completion and initializes buddy information
so that read-aheads are not lost in case of memory pressure.

At cr=0 the number of prefetching IOs is limited per allocation context
to prevent a situation where mballoc loads thousands of bitmaps looking
for a perfect group while ignoring groups with good chunks.

Together with the patch "ext4: limit scanning of uninitialized groups",
the mount time (which includes a few tiny allocations) of a 1PB filesystem
is reduced significantly:

                      0% full    50% full, unpatched    50% full, patched
  mount time              33s                  9279s                 563s

Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
---
 fs/ext4/balloc.c  |  12 ++++-
 fs/ext4/ext4.h    |   8 +++-
 fs/ext4/mballoc.c | 113 +++++++++++++++++++++++++++++++++++++++++++++-
 fs/ext4/mballoc.h |   2 +
 fs/ext4/sysfs.c   |   4 ++
 5 files changed, 135 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1ba46d87cdf1..c52011af4812 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -413,7 +413,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
  * Return buffer_head on success or an ERR_PTR in case of failure.
  */
 struct buffer_head *
-ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
+				 bool ignore_locked)
 {
 	struct ext4_group_desc *desc;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -444,6 +445,13 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
 	if (bitmap_uptodate(bh))
 		goto verify;
 
+	if (ignore_locked && buffer_locked(bh)) {
+		/* buffer under IO already, do not wait
+		 * if called for prefetching */
+		put_bh(bh);
+		return NULL;
+	}
+
 	lock_buffer(bh);
 	if (bitmap_uptodate(bh)) {
 		unlock_buffer(bh);
@@ -534,7 +542,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 	struct buffer_head *bh;
 	int err;
 
-	bh = ext4_read_block_bitmap_nowait(sb, block_group);
+	bh = ext4_read_block_bitmap_nowait(sb, block_group, false);
 	if (IS_ERR(bh))
 		return bh;
 	err = ext4_wait_block_bitmap(sb, block_group, bh);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 42f5060f3cdf..7451662e092a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1505,6 +1505,8 @@ struct ext4_sb_info {
 	/* where last allocation was done - for stream allocation */
 	unsigned long s_mb_last_group;
 	unsigned long s_mb_last_start;
+	unsigned int s_mb_prefetch;
+	unsigned int s_mb_prefetch_limit;
 
 	/* stats for buddy allocator */
 	atomic_t s_bal_reqs;	/* number of reqs with len > 1 */
@@ -2446,7 +2448,8 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
 extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
 
 extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
-						ext4_group_t block_group);
+						ext4_group_t block_group,
+						bool ignore_locked);
 extern int ext4_wait_block_bitmap(struct super_block *sb,
 				  ext4_group_t block_group,
 				  struct buffer_head *bh);
@@ -3145,6 +3148,7 @@ struct ext4_group_info {
 	(1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT)
 #define EXT4_GROUP_INFO_IBITMAP_CORRUPT		\
 	(1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT)
+#define EXT4_GROUP_INFO_BBITMAP_READ_BIT	4
 
 #define EXT4_MB_GRP_NEED_INIT(grp)	\
 	(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
@@ -3159,6 +3163,8 @@ struct ext4_group_info {
 	(set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
 #define EXT4_MB_GRP_CLEAR_TRIMMED(grp)	\
 	(clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_TEST_AND_SET_READ(grp)	\
+	(test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state)))
 
 #define EXT4_MAX_CONTENTION		8
 #define EXT4_CONTENTION_THRESHOLD	2
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 9d69f0a1372d..88af980e945e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -922,7 +922,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
 			bh[i] = NULL;
 			continue;
 		}
-		bh[i] = ext4_read_block_bitmap_nowait(sb, group);
+		bh[i] = ext4_read_block_bitmap_nowait(sb, group, false);
 		if (IS_ERR(bh[i])) {
 			err = PTR_ERR(bh[i]);
 			bh[i] = NULL;
@@ -2244,6 +2244,91 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
 	return ret;
 }
 
+/*
+ * each allocation context (i.e. a thread doing allocation) has its own
+ * sliding prefetch window of @s_mb_prefetch size which starts at the
+ * very first goal and moves ahead of the scanning.
+ * a side effect is that subsequent allocations will likely find
+ * the bitmaps in cache or at least in-flight.
+ */
+static void
+ext4_mb_prefetch(struct ext4_allocation_context *ac,
+		    ext4_group_t start)
+{
+	struct super_block *sb = ac->ac_sb;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_group_info *grp;
+	ext4_group_t nr, group = start;
+	struct buffer_head *bh;
+
+	/* limit prefetching at cr=0/1, otherwise mballoc can
+	 * spend a lot of time loading imperfect groups */
+	if (ac->ac_criteria < 2 && ac->ac_prefetch_ios >= sbi->s_mb_prefetch_limit)
+		return;
+
+	/* batch prefetching to get few READs in flight */
+	if (ac->ac_prefetch != group)
+		return;
+
+	nr = sbi->s_mb_prefetch;
+	if (ext4_has_feature_flex_bg(sb)) {
+		/* align to flex_bg to get more bitmaps with a single IO */
+		nr = (group / sbi->s_mb_prefetch) * sbi->s_mb_prefetch;
+		nr = nr + sbi->s_mb_prefetch - group;
+	}
+	while (nr-- > 0) {
+		grp = ext4_get_group_info(sb, group);
+
+		/* prevent expensive getblk() on groups w/ IO in progress */
+		if (EXT4_MB_GRP_TEST_AND_SET_READ(grp))
+			goto next;
+
+		/* ignore empty groups - those will be skipped
+		 * during the scanning as well */
+		if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) {
+			bh = ext4_read_block_bitmap_nowait(sb, group, true);
+			if (bh && !IS_ERR(bh)) {
+				if (!buffer_uptodate(bh))
+					ac->ac_prefetch_ios++;
+				brelse(bh);
+			}
+		}
+next:
+		if (++group >= ngroups)
+			group = 0;
+	}
+	ac->ac_prefetch = group;
+}
+
+static void
+ext4_mb_prefetch_fini(struct ext4_allocation_context *ac)
+{
+	struct ext4_group_info *grp;
+	ext4_group_t group;
+	int nr, rc;
+
+	/* initialize last window of prefetched groups */
+	nr = ac->ac_prefetch_ios;
+	if (nr > EXT4_SB(ac->ac_sb)->s_mb_prefetch)
+		nr = EXT4_SB(ac->ac_sb)->s_mb_prefetch;
+	group = ac->ac_prefetch;
+	if (!group)
+		group = ext4_get_groups_count(ac->ac_sb);
+	group--;
+	while (nr-- > 0) {
+		grp = ext4_get_group_info(ac->ac_sb, group);
+		if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) {
+			rc = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
+			if (rc)
+				break;
+		}
+		if (!group)
+			group = ext4_get_groups_count(ac->ac_sb);
+		group--;
+	}
+}
+
 static noinline_for_stack int
 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 {
@@ -2317,6 +2402,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 		 * from the goal value specified
 		 */
 		group = ac->ac_g_ex.fe_group;
+		ac->ac_prefetch = group;
 
 		for (i = 0; i < ngroups; group++, i++) {
 			int ret = 0;
@@ -2328,6 +2414,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 			if (group >= ngroups)
 				group = 0;
 
+			ext4_mb_prefetch(ac, group);
+
 			/* This now checks without needing the buddy page */
 			ret = ext4_mb_good_group_nolock(ac, group, cr);
 			if (ret <= 0) {
@@ -2402,6 +2490,10 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 	mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
 		 ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
 		 ac->ac_flags, cr, err);
+
+	/* use prefetched bitmaps to init buddy so that read info is not lost */
+	ext4_mb_prefetch_fini(ac);
+
 	return err;
 }
 
@@ -2648,6 +2740,25 @@ static int ext4_mb_init_backend(struct super_block *sb)
 			goto err_freebuddy;
 	}
 
+	if (ext4_has_feature_flex_bg(sb)) {
+		/* a single flex group is supposed to be read by a single IO */
+		sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex;
+		sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
+	} else {
+		sbi->s_mb_prefetch = 32;
+	}
+	if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
+		sbi->s_mb_prefetch = ext4_get_groups_count(sb);
+	/* how many real IOs to prefetch within a single allocation at cr=0
+	 * given cr=0 is a CPU-related optimization we shouldn't try to
+	 * load too many groups, at some point we should start to use what
+	 * we've got in memory.
+	 * with an average random access time 5ms, it'd take a second to get
+	 * 200 groups (* N with flex_bg), so let's make this limit 4 */
+	sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
+	if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
+		sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);
+
 	return 0;
 
 err_freebuddy:
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 6b4d17c2935d..45103cdf08cb 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -171,6 +171,8 @@ struct ext4_allocation_context {
 	struct page *ac_buddy_page;
 	struct ext4_prealloc_space *ac_pa;
 	struct ext4_locality_group *ac_lg;
+	ext4_group_t ac_prefetch;
+	int ac_prefetch_ios; /* number of initialized prefetch IOs */
 };
 
 #define AC_STATUS_CONTINUE	1
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 6c9fc9e21c13..31e0db726d21 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -240,6 +240,8 @@ EXT4_RO_ATTR_ES_STRING(last_error_func, s_last_error_func, 32);
 EXT4_ATTR(first_error_time, 0444, first_error_time);
 EXT4_ATTR(last_error_time, 0444, last_error_time);
 EXT4_ATTR(journal_task, 0444, journal_task);
+EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch);
+EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit);
 
 static unsigned int old_bump_val = 128;
 EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
@@ -283,6 +285,8 @@ static struct attribute *ext4_attrs[] = {
 #ifdef CONFIG_EXT4_DEBUG
 	ATTR_LIST(simulate_fail),
 #endif
+	ATTR_LIST(mb_prefetch),
+	ATTR_LIST(mb_prefetch_limit),
 	NULL,
 };
 ATTRIBUTE_GROUPS(ext4);
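A worked instance of the sizing logic in the ext4_mb_init_backend() hunk
above, under assumptions matching the in-code comment (s_log_groups_per_flex
= 4, i.e. 16 groups per flex_bg, and ~5ms per random read):

	/* sketch of the prefetch sizing: window = 8 flex groups, cr=0/1
	 * IO cap = 4 windows; all values assumed as stated above */
	#include <stdio.h>

	int main(void)
	{
		unsigned int s_mb_prefetch = (1U << 4) * 8;	/* 128 groups */
		unsigned int limit = s_mb_prefetch * 4;		/* 512 IOs */

		printf("window %u groups, cap %u IOs, ~%u ms worst case\n",
		       s_mb_prefetch, limit, limit * 5);
		return 0;
	}

With these numbers the cap allows roughly 512 prefetch reads per allocation
at cr=0/1, about 2.5s of random reads in the worst case, before mballoc
falls back to what is already in memory.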

Patch

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index a32e5f7b5385..6712146195ed 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -413,7 +413,8 @@  static int ext4_validate_block_bitmap(struct super_block *sb,
  * Return buffer_head on success or an ERR_PTR in case of failure.
  */
 struct buffer_head *
-ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
+				 bool ignore_locked)
 {
 	struct ext4_group_desc *desc;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -444,6 +445,13 @@  ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
 	if (bitmap_uptodate(bh))
 		goto verify;
 
+	if (ignore_locked && buffer_locked(bh)) {
+		/* buffer under IO already, do not wait
+		 * if called for prefetching */
+		put_bh(bh);
+		return NULL;
+	}
+
 	lock_buffer(bh);
 	if (bitmap_uptodate(bh)) {
 		unlock_buffer(bh);
@@ -534,7 +542,7 @@  ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 	struct buffer_head *bh;
 	int err;
 
-	bh = ext4_read_block_bitmap_nowait(sb, block_group);
+	bh = ext4_read_block_bitmap_nowait(sb, block_group, false);
 	if (IS_ERR(bh))
 		return bh;
 	err = ext4_wait_block_bitmap(sb, block_group, bh);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 91eb4381cae5..521fbcd8efc7 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1483,6 +1483,8 @@  struct ext4_sb_info {
 	/* where last allocation was done - for stream allocation */
 	unsigned long s_mb_last_group;
 	unsigned long s_mb_last_start;
+	unsigned int s_mb_prefetch;
+	unsigned int s_mb_prefetch_limit;
 
 	/* stats for buddy allocator */
 	atomic_t s_bal_reqs;	/* number of reqs with len > 1 */
@@ -2420,7 +2422,8 @@  extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
 extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
 
 extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
-						ext4_group_t block_group);
+						ext4_group_t block_group,
+						bool ignore_locked);
 extern int ext4_wait_block_bitmap(struct super_block *sb,
 				  ext4_group_t block_group,
 				  struct buffer_head *bh);
@@ -3119,6 +3122,7 @@  struct ext4_group_info {
 	(1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT)
 #define EXT4_GROUP_INFO_IBITMAP_CORRUPT		\
 	(1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT)
+#define EXT4_GROUP_INFO_BBITMAP_READ_BIT	4
 
 #define EXT4_MB_GRP_NEED_INIT(grp)	\
 	(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
@@ -3133,6 +3137,8 @@  struct ext4_group_info {
 	(set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
 #define EXT4_MB_GRP_CLEAR_TRIMMED(grp)	\
 	(clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_TEST_AND_SET_READ(grp)	\
+	(test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state)))
 
 #define EXT4_MAX_CONTENTION		8
 #define EXT4_CONTENTION_THRESHOLD	2
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index afb8bd9a10e9..ebfe258bfd0f 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -861,7 +861,7 @@  static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
 			bh[i] = NULL;
 			continue;
 		}
-		bh[i] = ext4_read_block_bitmap_nowait(sb, group);
+		bh[i] = ext4_read_block_bitmap_nowait(sb, group, false);
 		if (IS_ERR(bh[i])) {
 			err = PTR_ERR(bh[i]);
 			bh[i] = NULL;
@@ -2127,6 +2127,96 @@  static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 	return 0;
 }
 
+/*
+ * each allocation context (i.e. a thread doing allocation) has its own
+ * sliding prefetch window of @s_mb_prefetch size which starts at the
+ * very first goal and moves ahead of the scanning.
+ * a side effect is that subsequent allocations will likely find
+ * the bitmaps in cache or at least in-flight.
+ */
+static void
+ext4_mb_prefetch(struct ext4_allocation_context *ac,
+		    ext4_group_t start)
+{
+	struct super_block *sb = ac->ac_sb;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_group_info *grp;
+	ext4_group_t nr, group = start;
+	struct buffer_head *bh;
+
+	/* limit prefetching at cr=0, otherwise mballoc can
+	 * spend a lot of time loading imperfect groups */
+	if (ac->ac_criteria < 2 && ac->ac_prefetch_ios >= sbi->s_mb_prefetch_limit)
+		return;
+
+	/* batch prefetching to get few READs in flight */
+	nr = ac->ac_prefetch - group;
+	if (ac->ac_prefetch < group)
+		/* wrapped to the first groups */
+		nr += ngroups;
+	if (nr > 0)
+		return;
+	BUG_ON(nr < 0);
+
+	nr = sbi->s_mb_prefetch;
+	if (ext4_has_feature_flex_bg(sb)) {
+		/* align to flex_bg to get more bitmaps with a single IO */
+		nr = (group / sbi->s_mb_prefetch) * sbi->s_mb_prefetch;
+		nr = nr + sbi->s_mb_prefetch - group;
+	}
+	while (nr-- > 0) {
+		grp = ext4_get_group_info(sb, group);
+
+		/* prevent expensive getblk() on groups w/ IO in progress */
+		if (EXT4_MB_GRP_TEST_AND_SET_READ(grp))
+			goto next;
+
+		/* ignore empty groups - those will be skipped
+		 * during the scanning as well */
+		if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) {
+			bh = ext4_read_block_bitmap_nowait(sb, group, true);
+			if (bh && !IS_ERR(bh)) {
+				if (!buffer_uptodate(bh))
+					ac->ac_prefetch_ios++;
+				brelse(bh);
+			}
+		}
+next:
+		if (++group >= ngroups)
+			group = 0;
+	}
+	ac->ac_prefetch = group;
+}
+
+static void
+ext4_mb_prefetch_fini(struct ext4_allocation_context *ac)
+{
+	struct ext4_group_info *grp;
+	ext4_group_t group;
+	int nr, rc;
+
+	/* initialize last window of prefetched groups */
+	nr = ac->ac_prefetch_ios;
+	if (nr > EXT4_SB(ac->ac_sb)->s_mb_prefetch)
+		nr = EXT4_SB(ac->ac_sb)->s_mb_prefetch;
+	group = ac->ac_prefetch;
+	if (!group)
+		group = ext4_get_groups_count(ac->ac_sb);
+	group--;
+	while (nr-- > 0) {
+		grp = ext4_get_group_info(ac->ac_sb, group);
+		if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) {
+			rc = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
+			if (rc)
+				break;
+		}
+		if (!group)
+			group = ext4_get_groups_count(ac->ac_sb);
+		group--;
+	}
+}
+
 static noinline_for_stack int
 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 {
@@ -2200,6 +2290,7 @@  ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 		 * from the goal value specified
 		 */
 		group = ac->ac_g_ex.fe_group;
+		ac->ac_prefetch = group;
 
 		for (i = 0; i < ngroups; group++, i++) {
 			int ret = 0;
@@ -2211,6 +2302,8 @@  ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 			if (group >= ngroups)
 				group = 0;
 
+			ext4_mb_prefetch(ac, group);
+
 			/* This now checks without needing the buddy page */
 			ret = ext4_mb_good_group(ac, group, cr);
 			if (ret <= 0) {
@@ -2283,6 +2376,8 @@  ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 out:
 	if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
 		err = first_err;
+	/* use prefetched bitmaps to init buddy so that read info is not lost */
+	ext4_mb_prefetch_fini(ac);
 	return err;
 }
 
@@ -2542,6 +2637,25 @@  static int ext4_mb_init_backend(struct super_block *sb)
 			goto err_freebuddy;
 	}
 
+	if (ext4_has_feature_flex_bg(sb)) {
+		/* a single flex group is supposed to be read by a single IO */
+		sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex;
+		sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
+	} else {
+		sbi->s_mb_prefetch = 32;
+	}
+	if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
+		sbi->s_mb_prefetch = ext4_get_groups_count(sb);
+	/* now many real IOs to prefetch within a single allocation at cr=0
+	 * given cr=0 is a CPU-related optimization we shouldn't try to
+	 * load too many groups, at some point we should start to use what
+	 * we've got in memory.
+	 * with an average random access time 5ms, it'd take a second to get
+	 * 200 groups (* N with flex_bg), so let's make this limit 4 */
+	sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
+	if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
+		sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);
+
 	return 0;
 
 err_freebuddy:
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 88c98f17e3d9..c96a2bd81f72 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -175,6 +175,8 @@  struct ext4_allocation_context {
 	struct page *ac_buddy_page;
 	struct ext4_prealloc_space *ac_pa;
 	struct ext4_locality_group *ac_lg;
+	ext4_group_t ac_prefetch;
+	int ac_prefetch_ios; /* number of initialized prefetch IOs */
 };
 
 #define AC_STATUS_CONTINUE	1
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 04bfaf63752c..5f443f9d54b8 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -240,6 +240,8 @@  EXT4_RO_ATTR_ES_STRING(last_error_func, s_last_error_func, 32);
 EXT4_ATTR(first_error_time, 0444, first_error_time);
 EXT4_ATTR(last_error_time, 0444, last_error_time);
 EXT4_ATTR(journal_task, 0444, journal_task);
+EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch);
+EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit);
 
 static unsigned int old_bump_val = 128;
 EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
@@ -283,6 +285,8 @@  static struct attribute *ext4_attrs[] = {
 #ifdef CONFIG_EXT4_DEBUG
 	ATTR_LIST(simulate_fail),
 #endif
+	ATTR_LIST(mb_prefetch),
+	ATTR_LIST(mb_prefetch_limit),
 	NULL,
 };
 ATTRIBUTE_GROUPS(ext4);