[5/6] libext2fs/e2fsck: provide routines to read-ahead metadata

Message ID 20140809042643.2441.79312.stgit@birch.djwong.org
State Superseded, archived

Commit Message

Darrick Wong Aug. 9, 2014, 4:26 a.m. UTC
This patch adds to e2fsck the ability to pre-fetch metadata into the
page cache in the hopes of speeding up fsck runs.  There are two new
functions -- the first allows a caller to readahead a list of blocks,
and the second is a helper function that uses that first mechanism to
load group data (bitmaps, inode tables).

These new e2fsck routines require the addition of a dblist API to
allow us to iterate a subset of a dblist.  This will enable
incremental directory block readahead in e2fsck pass 2.

There's also a function to estimate the readahead amount for a given FS.
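
As a rough usage sketch (hypothetical call site, not part of this
patch; error handling elided), a caller might drive the new routines
like so:

	/* Prime the cache for the first 16 groups' bitmaps and inode
	 * tables before pass 1 touches them. */
	if (e2fsck_can_readahead(fs) && e2fsck_guess_readahead(fs) > 0)
		e2fsck_readahead(fs, E2FSCK_READA_BBITMAP |
				     E2FSCK_READA_IBITMAP |
				     E2FSCK_READA_ITABLE,
				 0 /* start group */, 16 /* ngroups */);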

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 configure           |    2 
 configure.in        |    1 
 e2fsck/Makefile.in  |    8 +-
 e2fsck/e2fsck.h     |   18 ++++
 e2fsck/readahead.c  |  213 +++++++++++++++++++++++++++++++++++++++++++++++++++
 e2fsck/util.c       |   51 ++++++++++++
 lib/config.h.in     |    3 +
 lib/ext2fs/dblist.c |   21 ++++-
 lib/ext2fs/ext2fs.h |   10 ++
 9 files changed, 319 insertions(+), 8 deletions(-)
 create mode 100644 e2fsck/readahead.c




Comments

Darrick Wong Aug. 11, 2014, 5:21 a.m. UTC | #1
On Fri, Aug 08, 2014 at 09:26:43PM -0700, Darrick J. Wong wrote:
> This patch adds to e2fsck the ability to pre-fetch metadata into the
> page cache in the hopes of speeding up fsck runs.  There are two new
> functions -- the first allows a caller to readahead a list of blocks,
> and the second is a helper function that uses that first mechanism to
> load group data (bitmaps, inode tables).
> 
> These new e2fsck routines require the addition of a dblist API to
> allow us to iterate a subset of a dblist.  This will enable
> incremental directory block readahead in e2fsck pass 2.
> 
> There's also a function to estimate the readahead amount for a given FS.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  configure           |    2 
>  configure.in        |    1 
>  e2fsck/Makefile.in  |    8 +-
>  e2fsck/e2fsck.h     |   18 ++++
>  e2fsck/readahead.c  |  213 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  e2fsck/util.c       |   51 ++++++++++++
>  lib/config.h.in     |    3 +
>  lib/ext2fs/dblist.c |   21 ++++-
>  lib/ext2fs/ext2fs.h |   10 ++
>  9 files changed, 319 insertions(+), 8 deletions(-)
>  create mode 100644 e2fsck/readahead.c
> 
> 
> diff --git a/configure b/configure
> index 8ad1408..71778e1 100755
> --- a/configure
> +++ b/configure
> @@ -12404,7 +12404,7 @@ fi
>  done
>  
>  fi
> -for ac_header in  	dirent.h 	errno.h 	execinfo.h 	getopt.h 	malloc.h 	mntent.h 	paths.h 	semaphore.h 	setjmp.h 	signal.h 	stdarg.h 	stdint.h 	stdlib.h 	termios.h 	termio.h 	unistd.h 	utime.h 	attr/xattr.h 	linux/falloc.h 	linux/fd.h 	linux/major.h 	linux/loop.h 	net/if_dl.h 	netinet/in.h 	sys/disklabel.h 	sys/disk.h 	sys/file.h 	sys/ioctl.h 	sys/mkdev.h 	sys/mman.h 	sys/mount.h 	sys/prctl.h 	sys/resource.h 	sys/select.h 	sys/socket.h 	sys/sockio.h 	sys/stat.h 	sys/syscall.h 	sys/sysmacros.h 	sys/time.h 	sys/types.h 	sys/un.h 	sys/wait.h
> +for ac_header in  	dirent.h 	errno.h 	execinfo.h 	getopt.h 	malloc.h 	mntent.h 	paths.h 	semaphore.h 	setjmp.h 	signal.h 	stdarg.h 	stdint.h 	stdlib.h 	termios.h 	termio.h 	unistd.h 	utime.h 	attr/xattr.h 	linux/falloc.h 	linux/fd.h 	linux/major.h 	linux/loop.h 	net/if_dl.h 	netinet/in.h 	sys/disklabel.h 	sys/disk.h 	sys/file.h 	sys/ioctl.h 	sys/mkdev.h 	sys/mman.h 	sys/mount.h 	sys/prctl.h 	sys/resource.h 	sys/select.h 	sys/socket.h 	sys/sockio.h 	sys/stat.h 	sys/syscall.h 	sys/sysctl.h 	sys/sysmacros.h 	sys/time.h 	sys/types.h 	sys/un.h 	sys/wait.h
>  do :
>    as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
>  ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default"
> diff --git a/configure.in b/configure.in
> index 3c0d64f..e881204 100644
> --- a/configure.in
> +++ b/configure.in
> @@ -941,6 +941,7 @@ AC_CHECK_HEADERS(m4_flatten([
>  	sys/sockio.h
>  	sys/stat.h
>  	sys/syscall.h
> +	sys/sysctl.h
>  	sys/sysmacros.h
>  	sys/time.h
>  	sys/types.h
> diff --git a/e2fsck/Makefile.in b/e2fsck/Makefile.in
> index c40b188..3a9d7b5 100644
> --- a/e2fsck/Makefile.in
> +++ b/e2fsck/Makefile.in
> @@ -62,7 +62,7 @@ OBJS= dict.o unix.o e2fsck.o super.o pass1.o pass1b.o pass2.o \
>  	pass3.o pass4.o pass5.o journal.o badblocks.o util.o dirinfo.o \
>  	dx_dirinfo.o ehandler.o problem.o message.o quota.o recovery.o \
>  	region.o revoke.o ea_refcount.o rehash.o profile.o prof_err.o \
> -	logfile.o sigcatcher.o $(MTRACE_OBJ)
> +	logfile.o sigcatcher.o readahead.o $(MTRACE_OBJ)
>  
>  PROFILED_OBJS= profiled/dict.o profiled/unix.o profiled/e2fsck.o \
>  	profiled/super.o profiled/pass1.o profiled/pass1b.o \
> @@ -73,7 +73,7 @@ PROFILED_OBJS= profiled/dict.o profiled/unix.o profiled/e2fsck.o \
>  	profiled/recovery.o profiled/region.o profiled/revoke.o \
>  	profiled/ea_refcount.o profiled/rehash.o profiled/profile.o \
>  	profiled/prof_err.o profiled/logfile.o \
> -	profiled/sigcatcher.o
> +	profiled/sigcatcher.o profiled/readahead.o
>  
>  SRCS= $(srcdir)/e2fsck.c \
>  	$(srcdir)/dict.c \
> @@ -97,6 +97,7 @@ SRCS= $(srcdir)/e2fsck.c \
>  	$(srcdir)/message.c \
>  	$(srcdir)/ea_refcount.c \
>  	$(srcdir)/rehash.c \
> +	$(srcdir)/readahead.c \
>  	$(srcdir)/region.c \
>  	$(srcdir)/profile.c \
>  	$(srcdir)/sigcatcher.c \
> @@ -527,3 +528,6 @@ quota.o: $(srcdir)/quota.c $(top_builddir)/lib/config.h \
>   $(srcdir)/profile.h prof_err.h $(top_srcdir)/lib/quota/quotaio.h \
>   $(top_srcdir)/lib/quota/dqblk_v2.h $(top_srcdir)/lib/quota/quotaio_tree.h \
>   $(top_srcdir)/lib/../e2fsck/dict.h $(srcdir)/problem.h
> +readahead.o: $(srcdir)/readahead.c $(top_builddir)/lib/config.h \
> + $(top_srcdir)/lib/ext2fs/ext2fs.h $(top_srcdir)/lib/ext2fs/ext2_fs.h \
> + $(top_builddir)/lib/ext2fs/ext2_err.h $(srcdir)/e2fsck.h
> diff --git a/e2fsck/e2fsck.h b/e2fsck/e2fsck.h
> index 8f16218..ead546e 100644
> --- a/e2fsck/e2fsck.h
> +++ b/e2fsck/e2fsck.h
> @@ -490,6 +490,23 @@ extern ext2_ino_t e2fsck_get_lost_and_found(e2fsck_t ctx, int fix);
>  extern errcode_t e2fsck_adjust_inode_count(e2fsck_t ctx, ext2_ino_t ino,
>  					   int adj);
>  
> +/* readahead.c */
> +#define E2FSCK_READA_SUPER	(0x01)
> +#define E2FSCK_READA_GDT	(0x02)
> +#define E2FSCK_READA_BBITMAP	(0x04)
> +#define E2FSCK_READA_IBITMAP	(0x08)
> +#define E2FSCK_READA_ITABLE	(0x10)
> +#define E2FSCK_READA_ALL_FLAGS	(0x1F)
> +errcode_t e2fsck_readahead(ext2_filsys fs, int flags, dgrp_t start,
> +			   dgrp_t ngroups);
> +#define E2FSCK_RA_DBLIST_IGNORE_BLOCKCNT	(0x01)
> +#define E2FSCK_RA_DBLIST_ALL_FLAGS		(0x01)
> +errcode_t e2fsck_readahead_dblist(ext2_filsys fs, int flags,
> +				  ext2_dblist dblist,
> +				  unsigned long long start,
> +				  unsigned long long count);
> +int e2fsck_can_readahead(ext2_filsys fs);
> +unsigned long long e2fsck_guess_readahead(ext2_filsys fs);
>  
>  /* region.c */
>  extern region_t region_create(region_addr_t min, region_addr_t max);
> @@ -579,6 +596,7 @@ extern errcode_t e2fsck_allocate_subcluster_bitmap(ext2_filsys fs,
>  						   int default_type,
>  						   const char *profile_name,
>  						   ext2fs_block_bitmap *ret);
> +unsigned long long get_memory_size(void);
>  
>  /* unix.c */
>  extern void e2fsck_clear_progbar(e2fsck_t ctx);
> diff --git a/e2fsck/readahead.c b/e2fsck/readahead.c
> new file mode 100644
> index 0000000..0395975
> --- /dev/null
> +++ b/e2fsck/readahead.c
> @@ -0,0 +1,213 @@
> +/*
> + * readahead.c -- Prefetch filesystem metadata to speed up fsck.
> + *
> + * Copyright (C) 2014 Oracle.
> + *
> + * %Begin-Header%
> + * This file may be redistributed under the terms of the GNU Library
> + * General Public License, version 2.
> + * %End-Header%
> + */
> +
> +#include "config.h"
> +#include <string.h>
> +
> +#include "e2fsck.h"
> +
> +#undef DEBUG
> +
> +#ifdef DEBUG
> +# define dbg_printf(f, a...)  do {printf(f, ## a); fflush(stdout); } while (0)
> +#else
> +# define dbg_printf(f, a...)
> +#endif
> +
> +struct read_dblist {
> +	errcode_t err;
> +	blk64_t run_start;
> +	blk64_t run_len;
> +	int flags;
> +};
> +
> +static EXT2_QSORT_TYPE readahead_dir_block_cmp(const void *a, const void *b)
> +{
> +	const struct ext2_db_entry2 *db_a =
> +		(const struct ext2_db_entry2 *) a;
> +	const struct ext2_db_entry2 *db_b =
> +		(const struct ext2_db_entry2 *) b;
> +
> +	return (int) (db_a->blk - db_b->blk);
> +}
> +
> +static int readahead_dir_block(ext2_filsys fs, struct ext2_db_entry2 *db,
> +			       void *priv_data)
> +{
> +	struct read_dblist *pr = priv_data;
> +	e2_blkcnt_t count = (pr->flags & E2FSCK_RA_DBLIST_IGNORE_BLOCKCNT ?
> +			     1 : db->blockcnt);
> +
> +	if (!pr->run_len || db->blk != pr->run_start + pr->run_len) {
> +		if (pr->run_len) {
> +			pr->err = io_channel_cache_readahead(fs->io,
> +							     pr->run_start,
> +							     pr->run_len);
> +			dbg_printf("readahead start=%llu len=%llu err=%d\n",
> +				   pr->run_start, pr->run_len,
> +				   (int)pr->err);
> +		}
> +		pr->run_start = db->blk;
> +		pr->run_len = 0;
> +	}
> +	pr->run_len += count;
> +
> +	return pr->err ? DBLIST_ABORT : 0;
> +}
> +
> +errcode_t e2fsck_readahead_dblist(ext2_filsys fs, int flags,
> +				  ext2_dblist dblist,
> +				  unsigned long long start,
> +				  unsigned long long count)
> +{
> +	errcode_t err;
> +	struct read_dblist pr;
> +
> +	dbg_printf("%s: flags=0x%x\n", __func__, flags);
> +	if (flags & ~E2FSCK_RA_DBLIST_ALL_FLAGS)
> +		return EXT2_ET_INVALID_ARGUMENT;
> +
> +	memset(&pr, 0, sizeof(pr));
> +	pr.flags = flags;
> +	err = ext2fs_dblist_iterate3(dblist, readahead_dir_block, start,
> +				     count, &pr);
> +	if (pr.err)
> +		return pr.err;
> +	if (err)
> +		return err;
> +
> +	if (pr.run_len)
> +		err = io_channel_cache_readahead(fs->io, pr.run_start,
> +						 pr.run_len);
> +
> +	return err;
> +}
> +
> +errcode_t e2fsck_readahead(ext2_filsys fs, int flags, dgrp_t start,
> +			   dgrp_t ngroups)
> +{
> +	blk64_t		super, old_gdt, new_gdt;
> +	blk_t		blocks;
> +	dgrp_t		i;
> +	ext2_dblist	dblist;
> +	dgrp_t		end = start + ngroups;
> +	errcode_t	err = 0;
> +
> +	dbg_printf("%s: flags=0x%x start=%d groups=%d\n", __func__, flags,
> +		   start, ngroups);
> +	if (flags & ~E2FSCK_READA_ALL_FLAGS)
> +		return EXT2_ET_INVALID_ARGUMENT;
> +
> +	if (end > fs->group_desc_count)
> +		end = fs->group_desc_count;
> +
> +	if (flags == 0)
> +		return 0;
> +
> +	err = ext2fs_init_dblist(fs, &dblist);

It turns out that each call to ext2fs_resize_mem() in the
ext2fs_add_dir_block2() function costs us ~2ms.  I'll add a new
ext2fs_init_dblist() API that lets us specify the initial size of the
list.  This seems to reduce the fsck runtime by a few more seconds.
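
Something along these lines, perhaps (hypothetical name and signature;
the final API may differ):

	/* Hypothetical: like ext2fs_init_dblist(), but preallocates
	 * room for @size entries so that filling the list does not go
	 * through repeated ext2fs_resize_mem() calls. */
	errcode_t ext2fs_init_dblist2(ext2_filsys fs,
				      unsigned long long size,
				      ext2_dblist *ret_dblist);

	/* e2fsck_readahead() knows an upper bound up front: at most
	 * five entries (super, gdt, two bitmaps, itable) per group. */
	err = ext2fs_init_dblist2(fs, 5ULL * ngroups, &dblist);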

--D

> +	if (err)
> +		return err;
> +
> +	for (i = start; i < end; i++) {
> +		err = ext2fs_super_and_bgd_loc2(fs, i, &super, &old_gdt,
> +						&new_gdt, &blocks);
> +		if (err)
> +			break;
> +
> +		if (flags & E2FSCK_READA_SUPER) {
> +			err = ext2fs_add_dir_block2(dblist, 0, super, 0);
> +			if (err)
> +				break;
> +		}
> +
> +		if (flags & E2FSCK_READA_GDT) {
> +			if (old_gdt)
> +				err = ext2fs_add_dir_block2(dblist, 0, old_gdt,
> +							    blocks);
> +			else if (new_gdt)
> +				err = ext2fs_add_dir_block2(dblist, 0, new_gdt,
> +							    blocks);
> +			else
> +				err = 0;
> +			if (err)
> +				break;
> +		}
> +
> +		if ((flags & E2FSCK_READA_BBITMAP) &&
> +		    !ext2fs_bg_flags_test(fs, i, EXT2_BG_BLOCK_UNINIT) &&
> +		    ext2fs_bg_free_blocks_count(fs, i) <
> +				fs->super->s_blocks_per_group) {
> +			super = ext2fs_block_bitmap_loc(fs, i);
> +			err = ext2fs_add_dir_block2(dblist, 0, super, 1);
> +			if (err)
> +				break;
> +		}
> +
> +		if ((flags & E2FSCK_READA_IBITMAP) &&
> +		    !ext2fs_bg_flags_test(fs, i, EXT2_BG_INODE_UNINIT) &&
> +		    ext2fs_bg_free_inodes_count(fs, i) <
> +				fs->super->s_inodes_per_group) {
> +			super = ext2fs_inode_bitmap_loc(fs, i);
> +			err = ext2fs_add_dir_block2(dblist, 0, super, 1);
> +			if (err)
> +				break;
> +		}
> +
> +		if ((flags & E2FSCK_READA_ITABLE) &&
> +		    ext2fs_bg_free_inodes_count(fs, i) <
> +				fs->super->s_inodes_per_group) {
> +			super = ext2fs_inode_table_loc(fs, i);
> +			blocks = fs->inode_blocks_per_group -
> +				 (ext2fs_bg_itable_unused(fs, i) *
> +				  EXT2_INODE_SIZE(fs->super) / fs->blocksize);
> +			err = ext2fs_add_dir_block2(dblist, 0, super, blocks);
> +			if (err)
> +				break;
> +		}
> +	}
> +
> +	if (!err) {
> +		ext2fs_dblist_sort2(dblist, readahead_dir_block_cmp);
> +		err = e2fsck_readahead_dblist(fs, 0, dblist, 0,
> +					      ext2fs_dblist_count2(dblist));
> +	}
> +
> +	ext2fs_free_dblist(dblist);
> +	return err;
> +}
> +
> +int e2fsck_can_readahead(ext2_filsys fs)
> +{
> +	errcode_t err;
> +
> +	err = io_channel_cache_readahead(fs->io, 0, 1);
> +	dbg_printf("%s: supp=%d\n", __func__, err != EXT2_ET_OP_NOT_SUPPORTED);
> +	return err != EXT2_ET_OP_NOT_SUPPORTED;
> +}
> +
> +unsigned long long e2fsck_guess_readahead(ext2_filsys fs)
> +{
> +	unsigned long long guess;
> +
> +	/*
> +	 * The optimal readahead sizes were experimentally determined by
> +	 * djwong in August 2014.  Setting the RA size to one block group's
> +	 * worth of inode table blocks seems to yield the largest reductions
> +	 * in e2fsck runtime.
> +	 */
> +	guess = fs->blocksize * fs->inode_blocks_per_group;
> +
> +	/* Disable RA if it'd use more than 1/100th of RAM. */
> +	if (get_memory_size() > (guess * 100))
> +		return guess / 1024;
> +
> +	return 0;
> +}
> diff --git a/e2fsck/util.c b/e2fsck/util.c
> index 8237328..74f20062 100644
> --- a/e2fsck/util.c
> +++ b/e2fsck/util.c
> @@ -37,6 +37,10 @@
>  #include <errno.h>
>  #endif
>  
> +#ifdef HAVE_SYS_SYSCTL_H
> +#include <sys/sysctl.h>
> +#endif
> +
>  #include "e2fsck.h"
>  
>  extern e2fsck_t e2fsck_global_ctx;   /* Try your very best not to use this! */
> @@ -848,3 +852,50 @@ errcode_t e2fsck_allocate_subcluster_bitmap(ext2_filsys fs, const char *descr,
>  	fs->default_bitmap_type = save_type;
>  	return retval;
>  }
> +
> +/* Return memory size in bytes */
> +unsigned long long get_memory_size(void)
> +{
> +#if defined(_SC_PHYS_PAGES)
> +# if defined(_SC_PAGESIZE)
> +	return (unsigned long long)sysconf(_SC_PHYS_PAGES) *
> +	       (unsigned long long)sysconf(_SC_PAGESIZE);
> +# elif defined(_SC_PAGE_SIZE)
> +	return (unsigned long long)sysconf(_SC_PHYS_PAGES) *
> +	       (unsigned long long)sysconf(_SC_PAGE_SIZE);
> +# endif
> +#elif defined(CTL_HW)
> +# if (defined(HW_MEMSIZE) || defined(HW_PHYSMEM64))
> +#  define CTL_HW_INT64
> +# elif (defined(HW_PHYSMEM) || defined(HW_REALMEM))
> +#  define CTL_HW_UINT
> +# endif
> +	int mib[2];
> +
> +	mib[0] = CTL_HW;
> +# if defined(HW_MEMSIZE)
> +	mib[1] = HW_MEMSIZE;
> +# elif defined(HW_PHYSMEM64)
> +	mib[1] = HW_PHYSMEM64;
> +# elif defined(HW_REALMEM)
> +	mib[1] = HW_REALMEM;
> +# elif defined(HW_PHYSMEM)
> +	mib[1] = HW_PHYSMEM;
> +# endif
> +# if defined(CTL_HW_INT64)
> +	unsigned long long size = 0;
> +# elif defined(CTL_HW_UINT)
> +	unsigned int size = 0;
> +# endif
> +# if defined(CTL_HW_INT64) || defined(CTL_HW_UINT)
> +	size_t len = sizeof(size);
> +
> +	if (sysctl(mib, 2, &size, &len, NULL, 0) == 0)
> +		return (unsigned long long)size;
> +# endif
> +	return 0;
> +#else
> +# warning "Don't know how to detect memory on your platform?"
> +	return 0;
> +#endif
> +}
> diff --git a/lib/config.h.in b/lib/config.h.in
> index 1d2382b..b598d1e 100644
> --- a/lib/config.h.in
> +++ b/lib/config.h.in
> @@ -500,6 +500,9 @@
>  /* Define to 1 if you have the <sys/syscall.h> header file. */
>  #undef HAVE_SYS_SYSCALL_H
>  
> +/* Define to 1 if you have the <sys/sysctl.h> header file. */
> +#undef HAVE_SYS_SYSCTL_H
> +
>  /* Define to 1 if you have the <sys/sysmacros.h> header file. */
>  #undef HAVE_SYS_SYSMACROS_H
>  
> diff --git a/lib/ext2fs/dblist.c b/lib/ext2fs/dblist.c
> index 942c4f0..bbdb221 100644
> --- a/lib/ext2fs/dblist.c
> +++ b/lib/ext2fs/dblist.c
> @@ -194,20 +194,25 @@ void ext2fs_dblist_sort2(ext2_dblist dblist,
>  /*
>   * This function iterates over the directory block list
>   */
> -errcode_t ext2fs_dblist_iterate2(ext2_dblist dblist,
> +errcode_t ext2fs_dblist_iterate3(ext2_dblist dblist,
>  				 int (*func)(ext2_filsys fs,
>  					     struct ext2_db_entry2 *db_info,
>  					     void	*priv_data),
> +				 unsigned long long start,
> +				 unsigned long long count,
>  				 void *priv_data)
>  {
> -	unsigned long long	i;
> +	unsigned long long	i, end;
>  	int		ret;
>  
>  	EXT2_CHECK_MAGIC(dblist, EXT2_ET_MAGIC_DBLIST);
>  
> +	end = start + count;
>  	if (!dblist->sorted)
>  		ext2fs_dblist_sort2(dblist, 0);
> -	for (i=0; i < dblist->count; i++) {
> +	if (end > dblist->count)
> +		end = dblist->count;
> +	for (i = start; i < end; i++) {
>  		ret = (*func)(dblist->fs, &dblist->list[i], priv_data);
>  		if (ret & DBLIST_ABORT)
>  			return 0;
> @@ -215,6 +220,16 @@ errcode_t ext2fs_dblist_iterate2(ext2_dblist dblist,
>  	return 0;
>  }
>  
> +errcode_t ext2fs_dblist_iterate2(ext2_dblist dblist,
> +				 int (*func)(ext2_filsys fs,
> +					     struct ext2_db_entry2 *db_info,
> +					     void	*priv_data),
> +				 void *priv_data)
> +{
> +	return ext2fs_dblist_iterate3(dblist, func, 0, dblist->count,
> +				      priv_data);
> +}
> +
>  static EXT2_QSORT_TYPE dir_block_cmp2(const void *a, const void *b)
>  {
>  	const struct ext2_db_entry2 *db_a =
> diff --git a/lib/ext2fs/ext2fs.h b/lib/ext2fs/ext2fs.h
> index b4a9f84..04a1f4a 100644
> --- a/lib/ext2fs/ext2fs.h
> +++ b/lib/ext2fs/ext2fs.h
> @@ -1052,11 +1052,17 @@ extern void ext2fs_dblist_sort2(ext2_dblist dblist,
>  extern errcode_t ext2fs_dblist_iterate(ext2_dblist dblist,
>  	int (*func)(ext2_filsys fs, struct ext2_db_entry *db_info,
>  		    void	*priv_data),
> -       void *priv_data);
> +	void *priv_data);
>  extern errcode_t ext2fs_dblist_iterate2(ext2_dblist dblist,
>  	int (*func)(ext2_filsys fs, struct ext2_db_entry2 *db_info,
>  		    void	*priv_data),
> -       void *priv_data);
> +	void *priv_data);
> +extern errcode_t ext2fs_dblist_iterate3(ext2_dblist dblist,
> +	int (*func)(ext2_filsys fs, struct ext2_db_entry2 *db_info,
> +		    void	*priv_data),
> +	unsigned long long start,
> +	unsigned long long count,
> +	void *priv_data);
>  extern errcode_t ext2fs_set_dir_block(ext2_dblist dblist, ext2_ino_t ino,
>  				      blk_t blk, int blockcnt);
>  extern errcode_t ext2fs_set_dir_block2(ext2_dblist dblist, ext2_ino_t ino,
> 
Theodore Ts'o Aug. 11, 2014, 6:24 a.m. UTC | #2
On Sun, Aug 10, 2014 at 10:21:51PM -0700, Darrick J. Wong wrote:
> 
> It turns out that each call to ext2fs_resize_mem() in the
> ext2fs_add_dir_block2() function costs us ~2ms.  I'll add a new
> ext2fs_init_dblist() API that lets us specify the initial size of the
> list.  This seems to reduce the fsck runtime by a few more seconds.

I suspect dblist is the wrong abstraction to use here.  Since the
blocks we want to read ahead are generally going to be contiguous, why
not use an rbtree bitmap, setting the blocks that should be subject to
readahead, and then use the find_first_set() function to iterate over
the bitmap?  Or, if you want to be even more efficient, create an
iterator function which takes a bitmap and returns the blocks that are
set in the bitmap, one by one.
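
A minimal sketch of that walk, assuming a block bitmap ra_map already
populated with the blocks to prefetch, and find-first helpers along the
lines of ext2fs_find_first_set_block_bitmap2() and
ext2fs_find_first_zero_block_bitmap2() (names assumed here):

	static errcode_t readahead_bitmap_runs(ext2_filsys fs,
					       ext2fs_block_bitmap ra_map)
	{
		blk64_t blk = fs->super->s_first_data_block;
		blk64_t last = ext2fs_blocks_count(fs->super) - 1;
		blk64_t run_start, run_end;
		errcode_t err = 0;

		/* Issue one readahead call per run of set bits. */
		while (!ext2fs_find_first_set_block_bitmap2(ra_map, blk, last,
							    &run_start)) {
			if (ext2fs_find_first_zero_block_bitmap2(ra_map,
						run_start, last, &run_end))
				run_end = last + 1;  /* run reaches the end */
			err = io_channel_cache_readahead(fs->io, run_start,
							 run_end - run_start);
			if (err)
				break;
			blk = run_end;
			if (blk > last)
				break;
		}
		return err;
	}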

						- Ted
Darrick Wong Aug. 11, 2014, 6:31 a.m. UTC | #3
On Mon, Aug 11, 2014 at 02:24:15AM -0400, Theodore Ts'o wrote:
> On Sun, Aug 10, 2014 at 10:21:51PM -0700, Darrick J. Wong wrote:
> > 
> > It turns out that each call to ext2fs_resize_mem() in the
> > ext2fs_add_dir_block2() function costs us ~2ms.  I'll add a new
> > ext2fs_init_dblist() API that lets us specify the initial size of the
> > list.  This seems to reduce the fsck runtime by a few more seconds.
> 
> I suspect dblist is the wrong abstraction to use here.  Since the
> blocks we want to read ahead are generally going to be contiguous, why
> not use an rbtree bitmap, setting the blocks that should be subject to
> readahead, and then use the find_first_set() function to iterate over
> the bitmap?  Or, if you want to be even more efficient, create an
> iterator function which takes a bitmap and returns the blocks that are
> set in the bitmap, one by one.

Hmm, I'll try that, though I'm almost tempted just to issue io_cache_readahead
calls directly from the for loop.  The only reason I'm using the list here at
all is to handle the case of reading multiple groups in a flexbg (pass5 RA).

--D
> 
> 						- Ted
Theodore Ts'o Aug. 11, 2014, 2:34 p.m. UTC | #4
On Sun, Aug 10, 2014 at 11:31:20PM -0700, Darrick J. Wong wrote:
> 
> Hmm, I'll try that, though I'm almost tempted just to issue io_cache_readahead
> calls directly from the for loop.  The only reason I'm using the list here at
> all is to handle the case of reading multiple groups in a flexbg (pass5 RA).

In that case you could just use an accumulator for each of the block
and inode bitmaps, and so long as the next group's block and inode
bitmaps are contiguous with the previous one, you can bump a counter.
Once you reach a discontiguous bitmap block, you can issue a single
readahead request for N blocks starting at block B.  That way you only
issue a single syscall, instead of one for every single block, and you
don't have the overhead involved with storing the list of blocks and
then sorting them.
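
In code, the accumulator might look something like this (sketch for the
block bitmaps only; error handling trimmed; it reuses the patch's
ext2fs_block_bitmap_loc() and io_channel_cache_readahead()):

	static errcode_t readahead_bbitmaps(ext2_filsys fs, dgrp_t start,
					    dgrp_t end)
	{
		blk64_t run_start = 0, run_len = 0;
		errcode_t err = 0;
		dgrp_t i;

		for (i = start; i < end && !err; i++) {
			blk64_t blk = ext2fs_block_bitmap_loc(fs, i);

			if (run_len && blk == run_start + run_len) {
				run_len++;	/* contiguous: bump counter */
				continue;
			}
			if (run_len)	/* discontiguity: flush the run */
				err = io_channel_cache_readahead(fs->io,
							run_start, run_len);
			run_start = blk;
			run_len = 1;
		}
		if (!err && run_len)
			err = io_channel_cache_readahead(fs->io, run_start,
							 run_len);
		return err;
	}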

					- Ted
Darrick Wong Aug. 11, 2014, 6:05 p.m. UTC | #5
On Mon, Aug 11, 2014 at 10:34:23AM -0400, Theodore Ts'o wrote:
> On Sun, Aug 10, 2014 at 11:31:20PM -0700, Darrick J. Wong wrote:
> > 
> > Hmm, I'll try that, though I'm almost tempted just to issue io_cache_readahead
> > calls directly from the for loop.  The only reason I'm using the list here at
> > all is to handle the case of reading multiple groups in a flexbg (pass5 RA).
> 
> In that case you could just use an accumulator for each of the block
> and inode bitmaps, and so long as the next group's block and inode
> bitmaps are contiguous with the previous one, you can bump a counter.
> Once you reach a discontiguous bitmap block, you can issue a single
> readahead request for N blocks starting at block B.  That way you only
> issue a single syscall, instead of one for every single block, and you
> don't have the overhead involved with storing the list of blocks and
> then sorting them.

Using the bitmap turns out to be pretty quick (~130us to start RA for 4 groups
vs. ~70us per group if I issue the RA directly).  Each fadvise call seems to
cost us ~1ms, so I'll keep using the bitmap to minimize the number of fadvise
calls, since it's also a lot less code.

--D

> 
> 					- Ted
Theodore Ts'o Aug. 11, 2014, 6:32 p.m. UTC | #6
On Mon, Aug 11, 2014 at 11:05:09AM -0700, Darrick J. Wong wrote:
> 
> Using the bitmap turns out to be pretty quick (~130us to start RA for 4 groups
> vs. ~70us per group if I issue the RA directly).  Each fadvise call seems to
> cost us ~1ms, so I'll keep using the bitmap to minimize the number of fadvise
> calls, since it's also a lot less code.

4 groups?  Since the default flex_bg size is 16 block groups, I would
have expected that you would want to start RA every 16 groups.

(And BTW, I've been wondering whether we should increase the flex_bg
size for bigger file systems.  By the time we get to 4TB disks, having
a flex_bg every 2GB seems a little small.)
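
(For scale: with 4KiB blocks a block group maps 32768 blocks, i.e.
128MiB, so the default 16-group flex_bg covers 16 * 128MiB = 2GiB --
the 2GB figure above.)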

						- Ted
Darrick Wong Aug. 11, 2014, 6:55 p.m. UTC | #7
On Mon, Aug 11, 2014 at 02:32:58PM -0400, Theodore Ts'o wrote:
> On Mon, Aug 11, 2014 at 11:05:09AM -0700, Darrick J. Wong wrote:
> > 
> > Using the bitmap turns out to be pretty quick (~130us to start RA for 4 groups
> > vs. ~70us per group if I issue the RA directly).  Each fadvise call seems to
> > cost us ~1ms, so I'll keep using the bitmap to minimize the number of fadvise
> > calls, since it's also a lot less code.
> 
> 4 groups?  Since the default flex_bg size is 16 block groups, I would
> have expected that you would want to start RA every 16 groups.

I was expecting 16 groups (32M readahead) to win, but as the observations in my
spreadsheet show, 2MB tends to win.  I _think_ the reason is that if we
encounter indirect map blocks or ETB blocks, they tend to be fairly close to
the file blocks in the block group, and if we're trying to do a large readahead
at the same time, we end up with a largeish seek penalty (half the flexbg on
average) for every ETB/map block.

I figured out what was going on with the 1TB SSD -- it has a huge RAM cache big
enough to store most of the metadata.  At that point, reads are essentially
free, but readahead costs us ~1ms per fadvise call.  If you use a RA buffer
that's big enough that there aren't many fadvise calls then you still come out
ahead (ditto if you shove the RA into a separate thread) but otherwise the
fadvise calls add up, badly.

Actually, I'd considered using a default of flexbg_size * itable_size, but (a)
the USB results are pretty bad for 32M v. 2M, and (b) I was thinking that 2MB
of readahead might be small enough that we could enable it by default without
having to worry about the ill effects of parallel e2fsck runs.

A logical next step might be to do ETB/block map readahead, but let's keep it
simple for now.  I should have time to update the spreadsheet to reflect
performance of the new bitmap code while I go mess with fixing the jbd2
problems.

> (And BTW, I've been wondering whether we should increase the flex_bg
> size for bigger file systems.  By the time we get to 4TB disks, having
> a flex_bg every 2GB seems a little small.)

:)

--D
> 
> 						- Ted
Theodore Ts'o Aug. 11, 2014, 8:10 p.m. UTC | #8
On Mon, Aug 11, 2014 at 11:55:32AM -0700, Darrick J. Wong wrote:
> I was expecting 16 groups (32M readahead) to win, but as the observations in my
> spreadsheet show, 2MB tends to win.  I _think_ the reason is that if we
> encounter indirect map blocks or ETB blocks, they tend to be fairly close to
> the file blocks in the block group, and if we're trying to do a large readahead
> at the same time, we end up with a largeish seek penalty (half the flexbg on
> average) for every ETB/map block.

Hmm, that might be an argument for not trying to increase the flex_bg
size, since we want seek distances within a flex_bg to be dominated by
settling time, and not by the track-to-track
acceleration/coasting/deceleration time.

> I figured out what was going on with the 1TB SSD -- it has a huge RAM cache big
> enough to store most of the metadata.  At that point, reads are essentially
> free, but readahead costs us ~1ms per fadvise call. 

Do we understand why fadvise() takes 1ms?   Is that something we can fix?

And readahead(2) was even worse, right?

							- Ted
Darrick Wong Aug. 11, 2014, 8:50 p.m. UTC | #9
On Mon, Aug 11, 2014 at 04:10:30PM -0400, Theodore Ts'o wrote:
> On Mon, Aug 11, 2014 at 11:55:32AM -0700, Darrick J. Wong wrote:
> > I was expecting 16 groups (32M readahead) to win, but as the observations in my
> > spreadsheet show, 2MB tends to win.  I _think_ the reason is that if we
> > encounter indirect map blocks or ETB blocks, they tend to be fairly close to
> > the file blocks in the block group, and if we're trying to do a large readahead
> > at the same time, we end up with a largeish seek penalty (half the flexbg on
> > average) for every ETB/map block.
> 
> Hmm, that might be an argument for not trying to increase the flex_bg
> size, since we want seek distances within a flex_bg to be dominated by
> settling time, and not by the track-to-track
> acceleration/coasting/deceleration time.

It might not be too horrible of a regression, since the distance between tracks
has gotten shorter and cylinders themselves have gotten bigger.  I suppose
you'd have to test a variety of flexbg sizes against a disk from, say, 5 years
ago.  If you know the size of the files you'll be storing at mkfs time (such as
with the mk_hugefiles.c options) then increasing flexbg size is probably ok to
avoid fragmenting.

But yes, I was sort of enjoying how stuff within a flexbg gets (sort of) faster
as disks get bigger. :)

> > I figured out what was going on with the 1TB SSD -- it has a huge RAM cache big
> > enough to store most of the metadata.  At that point, reads are essentially
> > free, but readahead costs us ~1ms per fadvise call. 
> 
> Do we understand why fadvise() takes 1ms?   Is that something we can fix?
> 
> And readahead(2) was even worse, right?

From the readahead(2) manpage:

"readahead() blocks until the specified data has been read."

The fadvise time is pretty consistently 1ms, but with readahead you have to
wait for it to read everything off the disk.  That's fine for threaded
readahead, but for our single-thread readahead it's not much better than
regular blocking reads.  Letting the kernel do the readahead in the background
is way faster.
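
For reference, the two kernel interfaces being compared (illustrative
snippet, not from the patch; fd/offset/length stand in for the io
channel's descriptor and the block range converted to bytes):

	#define _GNU_SOURCE
	#include <fcntl.h>

	static void hint_readahead(int fd, off_t offset, off_t length)
	{
		/* Asynchronous hint: returns almost immediately and the
		 * kernel fetches the pages in the background. */
		posix_fadvise(fd, offset, length, POSIX_FADV_WILLNEED);
	}

	static void blocking_readahead(int fd, off64_t offset, size_t length)
	{
		/* Blocks until the requested data has been read. */
		readahead(fd, offset, length);
	}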

I don't know why fadvise takes so long.  I'll ftrace it to see where it goes.

--D
> 
> 							- Ted