[RFC] ext4: New inode/block allocation algorithms for flex_bg filesystems

Message ID 20090218154310.GH3600@mini-me.lan
State Accepted, archived

Commit Message

Theodore Ts'o Feb. 18, 2009, 3:43 p.m. UTC
The find_group_flex() inode allocator is now only used if the
filesystem is mounted using the "oldalloc" mount option.  It is
replaced with the original Orlov allocator that has been updated for
flex_bg filesystems (it should behave the same way if flex_bg is
disabled).  The inode allocator now functions by taking into account
each flex_bg group, instead of each block group, when deciding whether
or not it's time to allocate a new directory into a fresh flex_bg.
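(For readers less familiar with flex_bg arithmetic: since the flexbg size is
always a power of two, mapping a block group to its flex_bg, and to the first
block group of that flex_bg, is a simple shift/mask.  A sketch using the
kernel's names, not itself part of the patch:

	int flex_size = ext4_flex_bg_size(sbi);  /* = 1 << s_log_groups_per_flex */
	ext4_group_t flex_group  = block_group >> sbi->s_log_groups_per_flex;
	ext4_group_t first_group = block_group & ~(flex_size - 1);

This is the arithmetic the patch applies in ext4_ext_find_goal() and
ext4_find_near() below.)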

The block allocator has also been changed so that the first block
group in each flex_bg is preferred for use for storing directory
blocks.  This keeps directory blocks close together, which is good for
speeding up e2fsck since large directories are more likely to look
like this:

debugfs:  stat /home/tytso/Maildir/cur
Inode: 1844562   Type: directory    Mode:  0700   Flags: 0x81000
Generation: 1132745781    Version: 0x00000000:0000ad71
User: 15806   Group: 15806   Size: 1060864
File ACL: 0    Directory ACL: 0
Links: 2   Blockcount: 2072
Fragment:  Address: 0    Number: 0    Size: 0
 ctime: 0x499c0ff4:164961f4 -- Wed Feb 18 08:41:08 2009
 atime: 0x499c0ff4:00000000 -- Wed Feb 18 08:41:08 2009
 mtime: 0x49957f51:00000000 -- Fri Feb 13 09:10:25 2009
crtime: 0x499c0f57:00d51440 -- Wed Feb 18 08:38:31 2009
Size of extra inode fields: 28
BLOCKS:
(0):7348651, (1-258):7348654-7348911
TOTAL: 259

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4_i.h  |    3 +
 fs/ext4/extents.c |   17 ++++-
 fs/ext4/ialloc.c  |  199 ++++++++++++++++++++++++++++++++++++++++-------------
 fs/ext4/inode.c   |   18 +++++-
 4 files changed, 186 insertions(+), 51 deletions(-)

Comments

Aneesh Kumar K.V Feb. 24, 2009, 8:59 a.m. UTC | #1
> 
> diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
> index 2d516c0..4ce2187 100644
> --- a/fs/ext4/ext4_i.h
> +++ b/fs/ext4/ext4_i.h
> @@ -122,6 +122,9 @@ struct ext4_inode_info {
>  	struct list_head i_prealloc_list;
>  	spinlock_t i_prealloc_lock;
> 
> +	/* ialloc */
> +	ext4_group_t	i_last_alloc_group;
> +
>  	/* allocation reservation info for delalloc */
>  	unsigned int i_reserved_data_blocks;
>  	unsigned int i_reserved_meta_blocks;
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index e2eab19..0aeb8c2 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -152,6 +152,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
>  	ext4_fsblk_t bg_start;
>  	ext4_fsblk_t last_block;
>  	ext4_grpblk_t colour;
> +	ext4_group_t block_group;
> +	int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
>  	int depth;
> 
>  	if (path) {
> @@ -170,10 +172,23 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
>  	}
> 
>  	/* OK. use inode's group */
> -	bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
> +	block_group = ei->i_block_group;
> +	if (flex_size >= 4) {
> +		block_group &= ~(flex_size-1);
> +		if (S_ISREG(inode->i_mode))
> +			block_group++;
> +	}


Can you explain why we select 4 here?


Also add a comment explaining that directory/symlink block allocation goes
to the first group of the flex_bg and regular files go to the second group,
and which type of workload that would help? You have the explanation in the
commit message.


> +	bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
>  		le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
>  	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
> 
> +	/*
> +	 * If we are doing delayed allocation, we don't need to take
> +	 * colour into account.
> +	 */
> +	if (test_opt(inode->i_sb, DELALLOC))
> +		return bg_start;
> +

Again, why don't we want to look at colour for delayed allocation?


>  	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
>  		colour = (current->pid % 16) *
>  			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
> diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
> index 21080ab..212e5be 100644
> --- a/fs/ext4/ialloc.c
> +++ b/fs/ext4/ialloc.c
> @@ -409,6 +409,42 @@ out:
>  }
> 
>  /*
> + * Helper function for Orlov's allocator; returns critical information 
> + * for a particular block group or flex_bg
> + */
> +struct orlov_stats {
> +	__u32 free_inodes;
> +	__u32 free_blocks;
> +	__u32 used_dirs;  
> +};
> +
> +void get_orlov_stats(struct super_block *sb, ext4_group_t g, 
> +		       int flex_size, struct orlov_stats *stats)
> +{
> +	struct ext4_group_desc *desc;
> +	ext4_group_t		ngroups = EXT4_SB(sb)->s_groups_count;
> +	int			i;
> +
> +	stats->free_inodes = 0;
> +	stats->free_blocks = 0;
> +	stats->used_dirs = 0;
> +
> +	g *= flex_size;


Can you add a comment to the function saying that g can be a flex group
number or an actual block group number, depending on flex_size? Without
that comment the above operation can be confusing.

> +
> +	for (i = 0; i < flex_size; i++) {
> +		if (g >= ngroups)
> +			break;
> +		desc = ext4_get_group_desc(sb, g++, NULL);
> +		if (!desc)
> +			continue;
> +
> +		stats->free_inodes += ext4_free_inodes_count(sb, desc);
> +		stats->free_blocks += ext4_free_blks_count(sb, desc);
> +		stats->used_dirs += ext4_used_dirs_count(sb, desc);
> +	}
> +}
> +
> +/*
>   * Orlov's allocator for directories.
>   *
>   * We always try to spread first-level directories.
> @@ -423,7 +459,6 @@ out:
>   * it has too many directories already (max_dirs) or
>   * it has too few free inodes left (min_inodes) or
>   * it has too few free blocks left (min_blocks) or
> - * it's already running too large debt (max_debt).
>   * Parent's group is preferred, if it doesn't satisfy these
>   * conditions we search cyclically through the rest. If none
>   * of the groups look good we just look for a group with more
> @@ -437,7 +472,7 @@ out:
>  #define BLOCK_COST 256

You can also remove the rest of the description about debt, along with
the INODE_COST and BLOCK_COST defines.



> 
>  static int find_group_orlov(struct super_block *sb, struct inode *parent,
> -				ext4_group_t *group)
> +			    ext4_group_t *group, int mode)
>  {
>  	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
>  	struct ext4_sb_info *sbi = EXT4_SB(sb);
> @@ -446,12 +481,21 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
>  	int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
>  	unsigned int freei, avefreei;
>  	ext4_fsblk_t freeb, avefreeb;
> -	ext4_fsblk_t blocks_per_dir;
>  	unsigned int ndirs;
> -	int max_debt, max_dirs, min_inodes;
> +	int max_dirs, min_inodes;
>  	ext4_grpblk_t min_blocks;
> -	ext4_group_t i;
> +	ext4_group_t i, grp, g;
>  	struct ext4_group_desc *desc;
> +	struct orlov_stats stats;
> +	int flex_size = ext4_flex_bg_size(sbi);
> +
> +	if (flex_size < 4)
> +		flex_size = 1;
> +	else {
> +		ngroups = (ngroups + flex_size - 1) >>
> +			sbi->s_log_groups_per_flex;
> +		parent_group >>= sbi->s_log_groups_per_flex;
> +	}
> 
>  	freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
>  	avefreei = freei / ngroups;
> @@ -460,71 +504,85 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
>  	do_div(avefreeb, ngroups);
>  	ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
> 
> -	if ((parent == sb->s_root->d_inode) ||
> -	    (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) {
> +	if (S_ISDIR(mode) &&
> +	    ((parent == sb->s_root->d_inode) ||
> +	     (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
>  		int best_ndir = inodes_per_group;
> -		ext4_group_t grp;
>  		int ret = -1;
> 
>  		get_random_bytes(&grp, sizeof(grp));
>  		parent_group = (unsigned)grp % ngroups;
>  		for (i = 0; i < ngroups; i++) {
> -			grp = (parent_group + i) % ngroups;
> -			desc = ext4_get_group_desc(sb, grp, NULL);
> -			if (!desc || !ext4_free_inodes_count(sb, desc))
> +			g = (parent_group + i) % ngroups;
> +			get_orlov_stats(sb, g, flex_size, &stats);
> +			if (!stats.free_inodes)
>  				continue;
> -			if (ext4_used_dirs_count(sb, desc) >= best_ndir)
> +			if (stats.used_dirs >= best_ndir)
>  				continue;
> -			if (ext4_free_inodes_count(sb, desc) < avefreei)
> +			if (stats.free_inodes < avefreei)
>  				continue;
> -			if (ext4_free_blks_count(sb, desc) < avefreeb)
> +			if (stats.free_blocks < avefreeb)
>  				continue;
> -			*group = grp;
> +			grp = g;
>  			ret = 0;
> -			best_ndir = ext4_used_dirs_count(sb, desc);
> +			best_ndir = stats.used_dirs;
> +		}
> +		if (ret)
> +			goto fallback;
> +	found_flex_bg:
> +		if (flex_size == 1) {
> +			*group = grp;
> +			return 0;
> +		}
> +
> +		grp *= flex_size;
> +		for (i = 1; i < flex_size; i++) {

Why do we start with i = 1?


> +			if (grp+i >= sbi->s_groups_count)
> +				break;
> +			desc = ext4_get_group_desc(sb, grp+i, NULL);
> +			if (desc && ext4_free_inodes_count(sb, desc)) {


Can you add a comment saying that we just pick the first group with a
free inode, because with flex_bg the goal block for the rest of the
file/directory's block allocation looks at the flex block group number?
(More details in ext4_ext_find_goal.)


> +				*group = grp+i;
> +				return 0;
> +			}
> +		}
> +		desc = ext4_get_group_desc(sb, grp, NULL);
> +		if (desc && ext4_free_inodes_count(sb, desc)) {
> +			*group = grp;
> +			return 0;
>  		}
> -		if (ret == 0)
> -			return ret;
>  		goto fallback;
>  	}
> 
> -	blocks_per_dir = ext4_blocks_count(es) - freeb;
> -	do_div(blocks_per_dir, ndirs);
> -
>  	max_dirs = ndirs / ngroups + inodes_per_group / 16;
> -	min_inodes = avefreei - inodes_per_group / 4;
> -	min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4;
> -
> -	max_debt = EXT4_BLOCKS_PER_GROUP(sb);
> -	max_debt /= max_t(int, blocks_per_dir, BLOCK_COST);
> -	if (max_debt * INODE_COST > inodes_per_group)
> -		max_debt = inodes_per_group / INODE_COST;
> -	if (max_debt > 255)
> -		max_debt = 255;
> -	if (max_debt == 0)
> -		max_debt = 1;
> +	min_inodes = avefreei - inodes_per_group*flex_size / 4;
> +	if (min_inodes < 1)
> +		min_inodes = 1;
> +	min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4;
> 
>  	for (i = 0; i < ngroups; i++) {
> -		*group = (parent_group + i) % ngroups;
> -		desc = ext4_get_group_desc(sb, *group, NULL);
> -		if (!desc || !ext4_free_inodes_count(sb, desc))
> +		grp = (parent_group + i) % ngroups;
> +		get_orlov_stats(sb, grp, flex_size, &stats);
> +		if (stats.used_dirs >= max_dirs)
>  			continue;
> -		if (ext4_used_dirs_count(sb, desc) >= max_dirs)
> +		if (stats.free_inodes < min_inodes)
>  			continue;
> -		if (ext4_free_inodes_count(sb, desc) < min_inodes)
> +		if (stats.free_blocks < min_blocks)
>  			continue;
> -		if (ext4_free_blks_count(sb, desc) < min_blocks)
> -			continue;
> -		return 0;
> +		goto found_flex_bg;
>  	}
> 
>  fallback:
> +	ngroups = sbi->s_groups_count;
> +	avefreei = freei / ngroups;
> +	parent_group = EXT4_I(parent)->i_block_group;
>  	for (i = 0; i < ngroups; i++) {
> -		*group = (parent_group + i) % ngroups;
> -		desc = ext4_get_group_desc(sb, *group, NULL);
> +		grp = (parent_group + i) % ngroups;
> +		desc = ext4_get_group_desc(sb, grp, NULL);
>  		if (desc && ext4_free_inodes_count(sb, desc) &&
> -			ext4_free_inodes_count(sb, desc) >= avefreei)
> +		    ext4_free_inodes_count(sb, desc) >= avefreei) {
> +			*group = grp;
>  			return 0;
> +		}
>  	}
> 
>  	if (avefreei) {
> @@ -540,12 +598,54 @@ fallback:
>  }
> 
>  static int find_group_other(struct super_block *sb, struct inode *parent,
> -				ext4_group_t *group)
> +			    ext4_group_t *group, int mode)
>  {
>  	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
>  	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
>  	struct ext4_group_desc *desc;
> -	ext4_group_t i;
> +	ext4_group_t i, last;
> +	int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
> +
> +	/*
> +	 * If we are doing flex_bg style allocation, try to put
> +	 * special inodes in the first block group; start files and
> +	 * directories at the 2nd block group in the flex_bg.
> +	 */

Why? Can you explain whether this placement helps any specific workload,
or is it something where you have observed that this placement helps?

> +	if (flex_size >= 4) {
> +		int ret, retry = 0;
> +
> +	try_again:
> +		i = (parent_group & ~(flex_size-1));
> +		last = i+flex_size;
> +		if (last > ngroups)
> +			last = ngroups;
> +		if (S_ISDIR(mode) || S_ISREG(mode))
> +			i++;
> +		while (i < last) {
> +			desc = ext4_get_group_desc(sb, i, NULL);
> +			if (desc && ext4_free_inodes_count(sb, desc)) {
> +				*group = i;
> +				return 0;
> +			}
> +			i++;
> +		}
> +		if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
> +			retry = 1;
> +			parent_group = EXT4_I(parent)->i_last_alloc_group;
> +			goto try_again;
> +		}
> +		/*
> +		 * If this didn't work, use the Orlov search
> +		 * algorithm; we pass in the mode to avoid the subdir
> +		 * of topdir algorithms.  If successful, save the
> +		 * group that we used so that future allocations in
> +		 * that directory are stable.
> +		 */
> +		ret = find_group_orlov(sb, parent, &parent_group, mode);
> +		if (ret == 0)
> +			EXT4_I(parent)->i_last_alloc_group = parent_group;
> +		return ret;
> +	}
> 
>  	/*
>  	 * Try to place the inode in its parent directory
> @@ -713,10 +813,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
>  	sbi = EXT4_SB(sb);
>  	es = sbi->s_es;
> 
> -	if (sbi->s_log_groups_per_flex) {
> +	if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
>  		ret2 = find_group_flex(sb, dir, &group);
>  		if (ret2 == -1) {
> -			ret2 = find_group_other(sb, dir, &group);
> +			ret2 = find_group_other(sb, dir, &group, mode);
>  			if (ret2 == 0)
>  				printk(KERN_NOTICE "ext4: find_group_flex "
>  				       "failed, fallback succeeded dir %lu\n",
> @@ -729,9 +829,9 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
>  		if (test_opt(sb, OLDALLOC))
>  			ret2 = find_group_dir(sb, dir, &group);
>  		else
> -			ret2 = find_group_orlov(sb, dir, &group);
> +			ret2 = find_group_orlov(sb, dir, &group, mode);
>  	} else
> -		ret2 = find_group_other(sb, dir, &group);
> +		ret2 = find_group_other(sb, dir, &group, mode);
> 
>  got_group:
>  	err = -ENOSPC;
> @@ -890,6 +990,7 @@ got:
>  	ei->i_file_acl = 0;
>  	ei->i_dtime = 0;
>  	ei->i_block_group = group;
> +	ei->i_last_alloc_group = ~0;
> 
>  	ext4_set_inode_flags(inode);
>  	if (IS_DIRSYNC(inode))
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index cbd2ca9..12f1374 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -459,6 +459,8 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
>  	ext4_fsblk_t bg_start;
>  	ext4_fsblk_t last_block;
>  	ext4_grpblk_t colour;
> +	ext4_group_t block_group;
> +	int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
> 
>  	/* Try to find previous block */
>  	for (p = ind->p - 1; p >= start; p--) {
> @@ -474,9 +476,22 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
>  	 * It is going to be referred to from the inode itself? OK, just put it
>  	 * into the same cylinder group then.
>  	 */
> -	bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group);
> +	block_group = ei->i_block_group;
> +	if (flex_size >= 4) {
> +		block_group &= ~(flex_size-1);
> +		if (S_ISREG(inode->i_mode))
> +			block_group++;
> +	}
> +	bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
>  	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
> 
> +	/*
> +	 * If we are doing delayed allocation, we don't need to take
> +	 * colour into account.
> +	 */
> +	if (test_opt(inode->i_sb, DELALLOC))
> +		return bg_start;
> +
>  	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
>  		colour = (current->pid % 16) *
>  			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
> @@ -4250,6 +4265,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
>  	ei->i_disksize = inode->i_size;
>  	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
>  	ei->i_block_group = iloc.block_group;
> +	ei->i_last_alloc_group = ~0;
>  	/*
>  	 * NOTE! The in-memory inode i_data array is in little-endian order
>  	 * even on big-endian machines: we do NOT byteswap the block numbers!
> -- 

-aneesh
Theodore Ts'o Feb. 24, 2009, 3:27 p.m. UTC | #2
On Tue, Feb 24, 2009 at 02:29:31PM +0530, Aneesh Kumar K.V wrote:
> >  	/* OK. use inode's group */
> > -	bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
> > +	block_group = ei->i_block_group;
> > +	if (flex_size >= 4) {
> > +		block_group &= ~(flex_size-1);
> > +		if (S_ISREG(inode->i_mode))
> > +			block_group++;
> > +	}
> 
> 
> Can you explain why we select 4 here?
> 
> Also add a comment explaining that directory/symlink block allocation goes
> to the first group of the flex_bg and regular files go to the second group,
> and which type of workload that would help? You have the explanation in the
> commit message.

Yeah, I'll add a comment there.

> > +	/*
> > +	 * If we are doing delayed allocation, we don't need to take
> > +	 * colour into account.
> > +	 */
> > +	if (test_opt(inode->i_sb, DELALLOC))
> > +		return bg_start;
> > +
> 
> Again, why don't we want to look at colour for delayed allocation?

If we're doing delayed allocation, we'll be allocating data blocks in
large chunks at a time.  Colour is much more important if you have
multiple processes allocating singleton blocks at a time, so that you
don't get interleaved allocations (i.e., ABABAB or ABCABCABC).  But if
we're grabbing chunks of blocks at a time, the benefits largely go away,
and in fact, artificially starting at a different location depending on
the process id can actually lead to greater fragmentation of the free
space.
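
(To make that concrete: with the usual 32,768 blocks per 4k-block group,
the existing colour computation works out to

	colour = (pid % 16) * (32768 / 16) = (pid % 16) * 2048

so two writers with adjacent pids start their block searches 2048 blocks
apart.  That spacing helps when each write allocates a single block, but
once delalloc hands out multi-block extents per allocation it mostly just
scatters the starting points across the group.  Illustrative arithmetic
only, assuming the default 4k block size.)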

> >  /*
> > + * Helper function for Orlov's allocator; returns critical information 
> > + * for a particular block group or flex_bg
> > + */
> > +struct orlov_stats {
> > +	__u32 free_inodes;
> > +	__u32 free_blocks;
> > +	__u32 used_dirs;  
> > +};
> > +
> > +void get_orlov_stats(struct super_block *sb, ext4_group_t g, 
> > +		       int flex_size, struct orlov_stats *stats)
> > +{
		...
> > +	g *= flex_size;
> 
> Can you add a comment to the function saying that g can be a flex group
> number or an actual block group number, depending on flex_size? Without
> that comment the above operation can be confusing.

Sure.

> > +/*
> >   * Orlov's allocator for directories.
> >   *
> 
> You can also remove the rest of the description about debt, along with
> the INODE_COST and BLOCK_COST defines.
> 

Yep, good point.


> > +	found_flex_bg:
> > +		if (flex_size == 1) {
> > +			*group = grp;
> > +			return 0;
> > +		}
> > +
> > +		grp *= flex_size;
> > +		for (i = 1; i < flex_size; i++) {
> 
> Why do we start with i = 1?
> 

I was putting directories in the second bg of the flex group; thinking
about it some more, there's really no good reason to do that.  I'll
change that back to one.

> Can you add a comment saying that we just pick the first group with a
> free inode, because with flex_bg the goal block for the rest of the
> file/directory's block allocation looks at the flex block group number?
> (More details in ext4_ext_find_goal.)

I'll do that.

> > +	/*
> > +	 * If we are doing flex_bg style allocation, try to put
> > +	 * special inodes in the first block group; start files and
> > +	 * directories at the 2nd block group in the flex_bg.
> > +	 */
> 
> Why? Can you explain whether this placement helps any specific workload,
> or is it something where you have observed that this placement helps?

This was left over from when I was using the inode number to influence
block allocation.  We're not doing this any more, so this should go
away.   Thanks for asking the question.

							- Ted
Theodore Ts'o Feb. 24, 2009, 7:04 p.m. UTC | #3
On Tue, Feb 24, 2009 at 10:27:34AM -0500, Theodore Tso wrote:
> > > +	/*
> > > +	 * If we are doing flex_bg style allocation, try to put
> > > +	 * special inodes in the first block group; start files and
> > > +	 * directories at the 2nd block group in the flex_bg.
> > > +	 */
> > 
> > Why? Can you explain whether this placement helps any specific workload,
> > or is it something where you have observed that this placement helps?
> 
> This was left over from when I was using the inode number to influence
> block allocation.  We're not doing this any more, so this should go
> away.   Thanks for asking the question.

Hm, I just tried taking it out, and it costs a 17% increase in e2fsck
time on my test filesystem.  The reason is pass 2, where we need to
check to make sure the filetype information in the directory blocks is
correct.  If the inode in question is a regular file or a directory, we
can determine that by looking at an inode bitmap.  However, if it is a
named pipe, device file, or symlink, we can only determine what it is
by reading the inode.  In the filesystem in question, which is an
Ubuntu Intrepid image, there are 5 character device files, 1 block
device file, 5 named pipes --- and 20,613 symbolic links.  If we group
all of these inodes together, it saves about 3 seconds in pass 2 (13
seconds versus 17 seconds).

We can also solve this problem by caching the file type information.
For example, I could store a list of all symlink inodes; at 4 bytes per
inode number, 20k symlinks will end up costing us about 80k of memory.
So this might be a problem which is better solved with some e2fsck
hacking (which has the advantage that it will speed up ext3 fsck runs
as well).
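
(A minimal sketch of that cache, assuming pass 1 builds a sorted array of
inode numbers; this is hypothetical, not e2fsck's actual data structure.
ext2_ino_t is e2fsprogs' 32-bit inode number type:

	/* Pass 1 records each symlink's inode number; at 4 bytes per
	 * ext2_ino_t, ~20k symlinks cost roughly 80k of memory. */
	static ext2_ino_t *symlink_list;
	static size_t num_symlinks;

	/* Pass 2 can then answer "is this inode a symlink?" without
	 * reading the inode itself, via a binary search. */
	static int inode_is_symlink(ext2_ino_t ino)
	{
		size_t lo = 0, hi = num_symlinks;

		while (lo < hi) {
			size_t mid = lo + (hi - lo) / 2;

			if (symlink_list[mid] < ino)
				lo = mid + 1;
			else if (symlink_list[mid] > ino)
				hi = mid;
			else
				return 1;
		}
		return 0;
	}

The same trick would apply to the named pipes and device files.)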

						- Ted
Andreas Dilger Feb. 24, 2009, 10:41 p.m. UTC | #4
Ted Ts'o wrote something like the following (didn't get the original email):
> @@ -122,6 +122,9 @@ struct ext4_inode_info {
>  	struct list_head i_prealloc_list;
>  	spinlock_t i_prealloc_lock;
> 
> +	/* ialloc */
> +	ext4_group_t	i_last_alloc_group;

Even better would be to store i_last_alloc_inode.  In the past Eric
has demonstrated that workloads which allocate lots of inodes exhibit
O(n^2) behaviour, because the entire group bitmap is searched from the
start each time, and that can cumulatively be very slow.  Having the
directory start searching from the most recently allocated inode would
make this O(n), and would not significantly alter behaviour.

If the inode is newly read from disk this would be initialized to 0,
or the inode number of the directory.  If it was recently creating
files it should be the previously allocated inode.  The only issue
is with inodes being repeatedly created and deleted in the same
directory, in which case we could reset this at delete time to be

    max(first inode in group, directory inode)

so that the same inodes keep being reused by the directory.  This could
be limited to clearing last_alloc_inode if the inode being freed is in
the parent group.
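
(A sketch of that reset, assuming the proposed i_last_alloc_inode field
existed; the first inode number in block group "group" is just
group * EXT4_INODES_PER_GROUP(sb) + 1:

	/* on unlink, bias the next inode search back toward the start */
	EXT4_I(dir)->i_last_alloc_inode =
		max_t(unsigned long,
		      (unsigned long)group * EXT4_INODES_PER_GROUP(sb) + 1,
		      dir->i_ino);

Clearing the field to its "unset" value, as the patch does for
i_last_alloc_group at create/iget time, would also work.)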

Hmm, it seems something like this is already done in find_group_orlov().
More discussion there.

> @@ -170,10 +172,23 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
> +	if (flex_size >= 4) {
> +		block_group &= ~(flex_size-1);
> +		if (S_ISREG(inode->i_mode))
> +			block_group++;
> +	}

The usage of "4" in several places to threshold flexbg size is also
confusing to me.

> +	bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
>  		le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
>  	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;

Similar to the above proposal, Eric and I discussed how storing the most
recently selected (flex) group in the superblock would avoid repeated
group scanning for allocations in separate directories.

> @@ -446,12 +481,21 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
> +	if (flex_size < 4)
> +		flex_size = 1;
> +	else {

Generally CodingStyle has { } around the first "if" clause if they are
around the "else" clause.

> @@ -460,71 +504,85 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
> +	if (S_ISDIR(mode) &&
> +	    ((parent == sb->s_root->d_inode) ||
> +	     (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
> 
>  		get_random_bytes(&grp, sizeof(grp));
>  		parent_group = (unsigned)grp % ngroups;
>  		for (i = 0; i < ngroups; i++) {
> +			g = (parent_group + i) % ngroups;
> +			get_orlov_stats(sb, g, flex_size, &stats);

Two notes here -
- get_random_bytes() is actually a fairly CPU-intensive operation, though
  I'm not sure it is significant in this context
- more importantly, this scans ALL groups in a very large filesystem to find
  an optimal group.  Large filesystems can have upwards of 60-100k groups
  to scan, and even with flexbg it seems that get_orlov_stats() is not
  using the flex_group[] stats in the common case where flex_size is the
  same as the flexbg size, but is recalculating them for every allocation.

> +			if (!stats.free_inodes)
>  				continue;
> +			if (stats.used_dirs >= best_ndir)
>  				continue;
> +			if (stats.free_inodes < avefreei)
>  				continue;
> +			if (stats.free_blocks < avefreeb)
>  				continue;
> +			grp = g;
>  			ret = 0;
> +			best_ndir = stats.used_dirs;

Based on the above, it would be preferable to settle for a "local"
optimal group after scanning N successful candidates, then to check
whether the "most recently selected" group saved in the superblock is
still better than the local optimum.  Of course it should continue to
scan all groups if no suitable group has been found yet.

Since the starting group is random, this will avoid being stuck in
a bad local minimum, while avoiding an exhaustive search when there
are a lot of groups.

> +	min_inodes = avefreei - inodes_per_group*flex_size / 4;
> +	min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4;

style - "inodes_per_group * flex_size / 4"

>  static int find_group_other(struct super_block *sb, struct inode *parent,
> +			    ext4_group_t *group, int mode)
>  {
> +	/*
> +	 * If we are doing flex_bg style allocation, try to put
> +	 * special inodes in the first block group; start files and
> +	 * directories at the 2nd block group in the flex_bg.
> +	 */
> +	if (flex_size >= 4) {
> +		int ret, retry = 0;
> +
> +	try_again:
> +		i = (parent_group & ~(flex_size-1));
> +		last = i+flex_size;

Can you please use a better variable name than "i" or "grp", such as
"new_group" or similar?

> +		while (i < last) {
> +			desc = ext4_get_group_desc(sb, i, NULL);
> +			if (desc && ext4_free_inodes_count(sb, desc)) {
> +				*group = i;
> +				return 0;
> +			}
> +			i++;
> +		}
> +		if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
> +			retry = 1;
> +			parent_group = EXT4_I(parent)->i_last_alloc_group;
> +			goto try_again;
> +		}

It would seem that using i_last_alloc_group FIRST (if set) would be better
than always checking all groups in the flexbg.  Either i_last_alloc_group
is in the same flex group as parent_group (so no harm done) or a previous
allocation wasn't able to find any free inode in that flex group and it
had to do a more expensive search elsewhere.

All that needs to be changed is to possibly reset i_last_alloc_group
to be parent_group when there is a file unlinked from the directory,
though this wouldn't be great for huge directories since any unlink
would cause a full scan.

Hmm, maybe checking ONLY the parent group for free inodes, then the
i_last_alloc_group, then remaining groups in the flex group, then
all remaining groups is the best ordering.  It allows inodes to be
clustered if there is space in the parent group (whether due to freeing
inodes in this directory or others), but avoids a potentially lengthy
scan for each inode (which might be noticable if there are 64 or 128
groups per flexbg).
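
(A sketch of that ordering, with grp_has_free() as a hypothetical helper
wrapping the group-descriptor checks the patch already uses:

	static int grp_has_free(struct super_block *sb, ext4_group_t g)
	{
		struct ext4_group_desc *desc = ext4_get_group_desc(sb, g, NULL);

		return desc && ext4_free_inodes_count(sb, desc);
	}

	static ext4_group_t pick_group(struct super_block *sb,
				       ext4_group_t parent_group,
				       ext4_group_t last_alloc,
				       int flex_size)
	{
		ext4_group_t base, i, ngroups = EXT4_SB(sb)->s_groups_count;

		/* 1. the parent's own group, to keep related inodes together */
		if (grp_has_free(sb, parent_group))
			return parent_group;
		/* 2. the directory's last successful allocation group, if set */
		if (last_alloc != ~0 && grp_has_free(sb, last_alloc))
			return last_alloc;
		/* 3. the remaining groups of the parent's flex group */
		base = parent_group & ~(flex_size - 1);
		for (i = 0; i < flex_size && base + i < ngroups; i++)
			if (grp_has_free(sb, base + i))
				return base + i;
		/* 4. finally, all remaining groups */
		for (i = 0; i < ngroups; i++)
			if (grp_has_free(sb, i))
				return i;
		return ~0;	/* ENOSPC */
	}

Steps 1 and 2 each cost a single descriptor lookup, so the common case
stays cheap.)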



Another possibility is to base the starting goal inode on the group that
holds the majority of the dirent inode numbers in the same directory leaf.
This would require deferring inode number selection until the dirent is
about to be inserted (i.e. delalloc for inodes) by moving the inode
selection into ext4_add_entry().  Combining this with a hash->inode range
mapping (as I've proposed in the past, with the range size based on the
directory size) would improve the htree->inode ordering and would also
improve performance for large directories.

It might not be a bad idea to separate the inode selection from the
inode initialization anyway, as ext4_new_inode() is pretty huge already,
and I can't see anything except the i_ino setting and insert_inode_hash()
that uses the inode number.  That means it should be possible to defer
the inode selection to ext4_add_entry() and ext4_dx_add_entry() once the
leaf block is found.


Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.

Eric Sandeen Feb. 25, 2009, 12:57 a.m. UTC | #5
Andreas Dilger wrote:
> Ted Ts'o wrote something like the following (didn't get the original email):
>> @@ -122,6 +122,9 @@ struct ext4_inode_info {
>>  	struct list_head i_prealloc_list;
>>  	spinlock_t i_prealloc_lock;
>>
>> +	/* ialloc */
>> +	ext4_group_t	i_last_alloc_group;
> 
> Even better would be to store i_last_alloc_inode.  In the past Eric
> has demonstrated that workloads which allocate lots of inodes exhibit
> O(n^2) behaviour, because the entire group bitmap is searched from the
> start each time, and that can cumulatively be very slow.  Having the
> directory start searching from the most recently allocated inode would
> make this O(n), and would not significantly alter behaviour.

A very hacky benchmark I had to demonstrate this is at

It just creates a directory tree starting at 000/ under the dir it's run
in, and times iterations of creates.

The tree is created in order, like:

000/000/000/000/000/000
000/000/000/000/000/001
000/000/000/000/000/002
...
000/000/000/000/000/fff
000/000/000/000/001/000
....
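
(For reference, a minimal sketch of a benchmark along these lines; this is
a hypothetical reconstruction with a shallower tree, not the actual
seq_mkdirs.c:

	#include <stdio.h>
	#include <sys/stat.h>
	#include <sys/time.h>
	#include <sys/types.h>

	int main(void)
	{
		char top[8], path[32];
		struct timeval t0, t1;
		int iter, a, b;

		for (iter = 0; iter < 8; iter++) {
			gettimeofday(&t0, NULL);
			snprintf(top, sizeof(top), "%03x", iter);
			mkdir(top, 0755);
			/* 16 * 4096 = 65536 new directories per iteration */
			for (a = 0; a < 16; a++) {
				snprintf(path, sizeof(path), "%s/%03x", top, a);
				mkdir(path, 0755);
				for (b = 0; b < 4096; b++) {
					snprintf(path, sizeof(path),
						 "%s/%03x/%03x", top, a, b);
					mkdir(path, 0755);
				}
			}
			gettimeofday(&t1, NULL);
			printf("iter %d: %f sec\n", iter,
			       (t1.tv_sec - t0.tv_sec) +
			       (t1.tv_usec - t0.tv_usec) / 1e6);
		}
		return 0;
	}

Each iteration builds its tree under a fresh top-level directory so later
iterations don't hit EEXIST.)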

On ext3:

# ./seq_mkdirs
iter 0: 6.191491 sec
iter 1: 8.455782 sec
iter 2: 9.435375 sec
iter 3: 10.198069 sec
iter 4: 10.922969 sec
iter 5: 10.800908 sec
iter 6: 12.940676 sec
iter 7: 15.513261 sec
...

On upstream ext4:

# ./seq_mkdirs
iter 0: 5.628331 sec
iter 1: 6.581043 sec
iter 2: 6.723445 sec
iter 3: 6.567891 sec
iter 4: 5.862526 sec
iter 5: 6.462064 sec
iter 6: 7.208110 sec
iter 7: 6.549735 sec
...


I did play with saving the last-allocated position, but if that's just
in-memory then it's a little odd that the first allocation will be
potentially much slower; that's probably acceptable, though.  It also
wouldn't fill in gaps when inodes are deleted if you don't re-search
from the parent.  ISTR that the constant create/delete case didn't cause
a problem; I will need to remind myself why ...

-Eric
Eric Sandeen Feb. 25, 2009, 12:58 a.m. UTC | #6
Eric Sandeen wrote:
> Andreas Dilger wrote:
>> Ted Ts'o wrote something like the following (didn't get the original email):
>>> @@ -122,6 +122,9 @@ struct ext4_inode_info {
>>>  	struct list_head i_prealloc_list;
>>>  	spinlock_t i_prealloc_lock;
>>>
>>> +	/* ialloc */
>>> +	ext4_group_t	i_last_alloc_group;
>> Even better would be to store i_last_alloc_inode.  In the past Eric
>> has demonstrated that workloads which allocate lots of inodes exhibit
>> O(n^2) behaviour, because the entire group bitmap is searched from the
>> start each time, and that can cumulatively be very slow.  Having the
>> directory start searching from the most recently allocated inode would
>> make this O(n), and would not significantly alter behaviour.
> 
> A very hacky benchmark I had to demonstrate this is at

Er, URL please, maestro...

http://people.redhat.com/esandeen/benchmarks/seq_mkdirs.c

> It just creates a directory tree starting at 000/ under the dir it's run
> in, and times iterations of creates.
> 
> The tree is created in order, like:
> 
> 000/000/000/000/000/000
> 000/000/000/000/000/001
> 000/000/000/000/000/002
> ...
> 000/000/000/000/000/fff
> 000/000/000/000/001/000
> ....
> 
> On ext3:
> 
> # ./seq_mkdirs
> iter 0: 6.191491 sec
> iter 1: 8.455782 sec
> iter 2: 9.435375 sec
> iter 3: 10.198069 sec
> iter 4: 10.922969 sec
> iter 5: 10.800908 sec
> iter 6: 12.940676 sec
> iter 7: 15.513261 sec
> ...
> 
> On upstream ext4:
> 
> # ./seq_mkdirs
> iter 0: 5.628331 sec
> iter 1: 6.581043 sec
> iter 2: 6.723445 sec
> iter 3: 6.567891 sec
> iter 4: 5.862526 sec
> iter 5: 6.462064 sec
> iter 6: 7.208110 sec
> iter 7: 6.549735 sec
> ...
> 
> 
> I did play with saving the last-allocated position, but if that's just
> in-memory then it's a little odd that the first allocation will be
> potentially much slower; that's probably acceptable, though.  It also
> wouldn't fill in gaps when inodes are deleted if you don't re-search
> from the parent.  ISTR that the constant create/delete case didn't cause
> a problem; I will need to remind myself why ...
> 
> -Eric
> 

Theodore Ts'o Feb. 25, 2009, 2:50 a.m. UTC | #7
> Two notes here -
> - get_random_bytes() is actually a fairly CPU-intensive operation, though
>   I'm not sure it is significant in this context
> - more importantly, this scans ALL groups in a very large filesystem to find
>   an optimal group.  Large filesystems can have upwards of 60-100k groups
>   to scan, and even with flexbg it seems that get_orlov_stats() is not
>   using the flex_group[] stats in the common case where flex_size is the
>   same as the flexbg size, but is recalculating them for every allocation.

Note that this only happens when we are creating directories.  I'm
making the assumption here that creating directories is rare, and
certainly happens less often than allocating inodes or blocks.  I was
deliberately not using flex_group[] stats, since I was thinking that
eventually it would be worthwhile to nuke the need to take a spinlock
for every inode and block allocation, and it wasn't keeping track of
the number of directories, anyway.

We also don't actually scan ALL the groups.  For top-level directories
we do, yes.  But most directories aren't top-level, and for
non-top-level directories we find the first block group which meets the
minimum criteria:

	for (i = 0; i < ngroups; i++) {
		grp = (parent_group + i) % ngroups;
		get_orlov_stats(sb, grp, flex_size, &stats);
		if (stats.used_dirs >= max_dirs)
			continue;
		if (stats.free_inodes < min_inodes)
			continue;
		if (stats.free_blocks < min_blocks)
			continue;
		goto found_flex_bg;
	}

Looking at Eric's benchmark, it looks like each iteration creates
196,608 directories.  If an iteration takes 6 seconds, then each mkdir
is taking about 30 microseconds, and if an iteration takes 15 seconds,
each mkdir is taking about 76 microseconds.  So the variance is large,
but how bad does it really get?

Also, what workload is likely going to be creating tons and tons of
directories?  Even a psycho squid cache hierarchy is only 65536
directories.

> Based on the above, it would be preferable to settle for a "local"
> optimal group after scanning N successful candidates, then to check
> whether the "most recently selected" group saved in the superblock is
> still better than the local optimum.  Of course it should continue to
> scan all groups if no suitable group has been found yet.
> 
> Since the starting group is random, this will avoid being stuck in
> a bad local minimum, while avoiding an exhaustive search when there
> are a lot of groups.

The above algorithm is only used for top-level directories.

> It would seem that using i_last_alloc_group FIRST (if set) would be better
> than always checking all groups in the flexbg.  Either i_last_alloc_group
> is in the same flex group as parent_group (so no harm done) or a previous
> allocation wasn't able to find any free inode in that flex group and it
> had to do a more expensive search elsewhere.

(This is the algorithm used for non-directories.)  Yeah, I agree that
keeping track of the last flex group and using it first makes sense.

I do think it makes sense to start at the beginning of each flex
group; checking each block group is pretty fast, since we're not
scanning the bitmap, just checking a count value.  Even if there are
128 groups per flex group, we're in big trouble if checking the free
inode counts for 100+ groups is noticeable.

							- Ted
Patch

diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index 2d516c0..4ce2187 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -122,6 +122,9 @@  struct ext4_inode_info {
 	struct list_head i_prealloc_list;
 	spinlock_t i_prealloc_lock;
 
+	/* ialloc */
+	ext4_group_t	i_last_alloc_group;
+
 	/* allocation reservation info for delalloc */
 	unsigned int i_reserved_data_blocks;
 	unsigned int i_reserved_meta_blocks;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e2eab19..0aeb8c2 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -152,6 +152,8 @@  static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 	ext4_fsblk_t bg_start;
 	ext4_fsblk_t last_block;
 	ext4_grpblk_t colour;
+	ext4_group_t block_group;
+	int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
 	int depth;
 
 	if (path) {
@@ -170,10 +172,23 @@  static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 	}
 
 	/* OK. use inode's group */
-	bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
+	block_group = ei->i_block_group;
+	if (flex_size >= 4) {
+		block_group &= ~(flex_size-1);
+		if (S_ISREG(inode->i_mode))
+			block_group++;
+	}
+	bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
 		le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
 	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
 
+	/*
+	 * If we are doing delayed allocation, we don't need to take
+	 * colour into account.
+	 */
+	if (test_opt(inode->i_sb, DELALLOC))
+		return bg_start;
+
 	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
 		colour = (current->pid % 16) *
 			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 21080ab..212e5be 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -409,6 +409,42 @@  out:
 }
 
 /*
+ * Helper function for Orlov's allocator; returns critical information 
+ * for a particular block group or flex_bg
+ */
+struct orlov_stats {
+	__u32 free_inodes;
+	__u32 free_blocks;
+	__u32 used_dirs;  
+};
+
+void get_orlov_stats(struct super_block *sb, ext4_group_t g, 
+		       int flex_size, struct orlov_stats *stats)
+{
+	struct ext4_group_desc *desc;
+	ext4_group_t		ngroups = EXT4_SB(sb)->s_groups_count;
+	int			i;
+
+	stats->free_inodes = 0;
+	stats->free_blocks = 0;
+	stats->used_dirs = 0;
+
+	g *= flex_size;
+
+	for (i = 0; i < flex_size; i++) {
+		if (g >= ngroups)
+			break;
+		desc = ext4_get_group_desc(sb, g++, NULL);
+		if (!desc)
+			continue;
+
+		stats->free_inodes += ext4_free_inodes_count(sb, desc);
+		stats->free_blocks += ext4_free_blks_count(sb, desc);
+		stats->used_dirs += ext4_used_dirs_count(sb, desc);
+	}
+}
+
+/*
  * Orlov's allocator for directories.
  *
  * We always try to spread first-level directories.
@@ -423,7 +459,6 @@  out:
  * it has too many directories already (max_dirs) or
  * it has too few free inodes left (min_inodes) or
  * it has too few free blocks left (min_blocks) or
- * it's already running too large debt (max_debt).
  * Parent's group is preferred, if it doesn't satisfy these
  * conditions we search cyclically through the rest. If none
  * of the groups look good we just look for a group with more
@@ -437,7 +472,7 @@  out:
 #define BLOCK_COST 256
 
 static int find_group_orlov(struct super_block *sb, struct inode *parent,
-				ext4_group_t *group)
+			    ext4_group_t *group, int mode)
 {
 	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -446,12 +481,21 @@  static int find_group_orlov(struct super_block *sb, struct inode *parent,
 	int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
 	unsigned int freei, avefreei;
 	ext4_fsblk_t freeb, avefreeb;
-	ext4_fsblk_t blocks_per_dir;
 	unsigned int ndirs;
-	int max_debt, max_dirs, min_inodes;
+	int max_dirs, min_inodes;
 	ext4_grpblk_t min_blocks;
-	ext4_group_t i;
+	ext4_group_t i, grp, g;
 	struct ext4_group_desc *desc;
+	struct orlov_stats stats;
+	int flex_size = ext4_flex_bg_size(sbi);
+
+	if (flex_size < 4)
+		flex_size = 1;
+	else {
+		ngroups = (ngroups + flex_size - 1) >>
+			sbi->s_log_groups_per_flex;
+		parent_group >>= sbi->s_log_groups_per_flex;
+	}
 
 	freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
 	avefreei = freei / ngroups;
@@ -460,71 +504,85 @@  static int find_group_orlov(struct super_block *sb, struct inode *parent,
 	do_div(avefreeb, ngroups);
 	ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
 
-	if ((parent == sb->s_root->d_inode) ||
-	    (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) {
+	if (S_ISDIR(mode) &&
+	    ((parent == sb->s_root->d_inode) ||
+	     (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
 		int best_ndir = inodes_per_group;
-		ext4_group_t grp;
 		int ret = -1;
 
 		get_random_bytes(&grp, sizeof(grp));
 		parent_group = (unsigned)grp % ngroups;
 		for (i = 0; i < ngroups; i++) {
-			grp = (parent_group + i) % ngroups;
-			desc = ext4_get_group_desc(sb, grp, NULL);
-			if (!desc || !ext4_free_inodes_count(sb, desc))
+			g = (parent_group + i) % ngroups;
+			get_orlov_stats(sb, g, flex_size, &stats);
+			if (!stats.free_inodes)
 				continue;
-			if (ext4_used_dirs_count(sb, desc) >= best_ndir)
+			if (stats.used_dirs >= best_ndir)
 				continue;
-			if (ext4_free_inodes_count(sb, desc) < avefreei)
+			if (stats.free_inodes < avefreei)
 				continue;
-			if (ext4_free_blks_count(sb, desc) < avefreeb)
+			if (stats.free_blocks < avefreeb)
 				continue;
-			*group = grp;
+			grp = g;
 			ret = 0;
-			best_ndir = ext4_used_dirs_count(sb, desc);
+			best_ndir = stats.used_dirs;
+		}
+		if (ret)
+			goto fallback;
+	found_flex_bg:
+		if (flex_size == 1) {
+			*group = grp;
+			return 0;
+		}
+
+		grp *= flex_size;
+		for (i = 1; i < flex_size; i++) {
+			if (grp+i >= sbi->s_groups_count)
+				break;
+			desc = ext4_get_group_desc(sb, grp+i, NULL);
+			if (desc && ext4_free_inodes_count(sb, desc)) {
+				*group = grp+i;
+				return 0;
+			}
+		}
+		desc = ext4_get_group_desc(sb, grp, NULL);
+		if (desc && ext4_free_inodes_count(sb, desc)) {
+			*group = grp;
+			return 0;
 		}
-		if (ret == 0)
-			return ret;
 		goto fallback;
 	}
 
-	blocks_per_dir = ext4_blocks_count(es) - freeb;
-	do_div(blocks_per_dir, ndirs);
-
 	max_dirs = ndirs / ngroups + inodes_per_group / 16;
-	min_inodes = avefreei - inodes_per_group / 4;
-	min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4;
-
-	max_debt = EXT4_BLOCKS_PER_GROUP(sb);
-	max_debt /= max_t(int, blocks_per_dir, BLOCK_COST);
-	if (max_debt * INODE_COST > inodes_per_group)
-		max_debt = inodes_per_group / INODE_COST;
-	if (max_debt > 255)
-		max_debt = 255;
-	if (max_debt == 0)
-		max_debt = 1;
+	min_inodes = avefreei - inodes_per_group*flex_size / 4;
+	if (min_inodes < 1)
+		min_inodes = 1;
+	min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4;
 
 	for (i = 0; i < ngroups; i++) {
-		*group = (parent_group + i) % ngroups;
-		desc = ext4_get_group_desc(sb, *group, NULL);
-		if (!desc || !ext4_free_inodes_count(sb, desc))
+		grp = (parent_group + i) % ngroups;
+		get_orlov_stats(sb, grp, flex_size, &stats);
+		if (stats.used_dirs >= max_dirs)
 			continue;
-		if (ext4_used_dirs_count(sb, desc) >= max_dirs)
+		if (stats.free_inodes < min_inodes)
 			continue;
-		if (ext4_free_inodes_count(sb, desc) < min_inodes)
+		if (stats.free_blocks < min_blocks)
 			continue;
-		if (ext4_free_blks_count(sb, desc) < min_blocks)
-			continue;
-		return 0;
+		goto found_flex_bg;
 	}
 
 fallback:
+	ngroups = sbi->s_groups_count;
+	avefreei = freei / ngroups;
+	parent_group = EXT4_I(parent)->i_block_group;
 	for (i = 0; i < ngroups; i++) {
-		*group = (parent_group + i) % ngroups;
-		desc = ext4_get_group_desc(sb, *group, NULL);
+		grp = (parent_group + i) % ngroups;
+		desc = ext4_get_group_desc(sb, grp, NULL);
 		if (desc && ext4_free_inodes_count(sb, desc) &&
-			ext4_free_inodes_count(sb, desc) >= avefreei)
+		    ext4_free_inodes_count(sb, desc) >= avefreei) {
+			*group = grp;
 			return 0;
+		}
 	}
 
 	if (avefreei) {
@@ -540,12 +598,54 @@  fallback:
 }
 
 static int find_group_other(struct super_block *sb, struct inode *parent,
-				ext4_group_t *group)
+			    ext4_group_t *group, int mode)
 {
 	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
 	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
 	struct ext4_group_desc *desc;
-	ext4_group_t i;
+	ext4_group_t i, last;
+	int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
+
+	/*
+	 * If we are doing flex_bg style allocation, try to put
+	 * special inodes in the first block group; start files and
+	 * directories at the 2nd block group in the flex_bg.
+	 */
+	if (flex_size >= 4) {
+		int ret, retry = 0;
+
+	try_again:
+		i = (parent_group & ~(flex_size-1));
+		last = i+flex_size;
+		if (last > ngroups)
+			last = ngroups;
+		if (S_ISDIR(mode) || S_ISREG(mode))
+			i++;
+		while (i < last) {
+			desc = ext4_get_group_desc(sb, i, NULL);
+			if (desc && ext4_free_inodes_count(sb, desc)) {
+				*group = i;
+				return 0;
+			}
+			i++;
+		}
+		if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
+			retry = 1;
+			parent_group = EXT4_I(parent)->i_last_alloc_group;
+			goto try_again;
+		}
+		/*
+		 * If this didn't work, use the Orlov search
+		 * algorithm; we pass in the mode to avoid the subdir
+		 * of topdir algorithms.  If successful, save the
+		 * group that we used so that future allocations in
+		 * that directory are stable.
+		 */
+		ret = find_group_orlov(sb, parent, &parent_group, mode);
+		if (ret == 0)
+			EXT4_I(parent)->i_last_alloc_group = parent_group;
+		return ret;
+	}
 
 	/*
 	 * Try to place the inode in its parent directory
@@ -713,10 +813,10 @@  struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 	sbi = EXT4_SB(sb);
 	es = sbi->s_es;
 
-	if (sbi->s_log_groups_per_flex) {
+	if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
 		ret2 = find_group_flex(sb, dir, &group);
 		if (ret2 == -1) {
-			ret2 = find_group_other(sb, dir, &group);
+			ret2 = find_group_other(sb, dir, &group, mode);
 			if (ret2 == 0)
 				printk(KERN_NOTICE "ext4: find_group_flex "
 				       "failed, fallback succeeded dir %lu\n",
@@ -729,9 +829,9 @@  struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 		if (test_opt(sb, OLDALLOC))
 			ret2 = find_group_dir(sb, dir, &group);
 		else
-			ret2 = find_group_orlov(sb, dir, &group);
+			ret2 = find_group_orlov(sb, dir, &group, mode);
 	} else
-		ret2 = find_group_other(sb, dir, &group);
+		ret2 = find_group_other(sb, dir, &group, mode);
 
 got_group:
 	err = -ENOSPC;
@@ -890,6 +990,7 @@  got:
 	ei->i_file_acl = 0;
 	ei->i_dtime = 0;
 	ei->i_block_group = group;
+	ei->i_last_alloc_group = ~0;
 
 	ext4_set_inode_flags(inode);
 	if (IS_DIRSYNC(inode))
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index cbd2ca9..12f1374 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -459,6 +459,8 @@  static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
 	ext4_fsblk_t bg_start;
 	ext4_fsblk_t last_block;
 	ext4_grpblk_t colour;
+	ext4_group_t block_group;
+	int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
 
 	/* Try to find previous block */
 	for (p = ind->p - 1; p >= start; p--) {
@@ -474,9 +476,22 @@  static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
 	 * It is going to be referred to from the inode itself? OK, just put it
 	 * into the same cylinder group then.
 	 */
-	bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group);
+	block_group = ei->i_block_group;
+	if (flex_size >= 4) {
+		block_group &= ~(flex_size-1);
+		if (S_ISREG(inode->i_mode))
+			block_group++;
+	}
+	bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
 	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
 
+	/*
+	 * If we are doing delayed allocation, we don't need to take
+	 * colour into account.
+	 */
+	if (test_opt(inode->i_sb, DELALLOC))
+		return bg_start;
+
 	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
 		colour = (current->pid % 16) *
 			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
@@ -4250,6 +4265,7 @@  struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	ei->i_disksize = inode->i_size;
 	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
 	ei->i_block_group = iloc.block_group;
+	ei->i_last_alloc_group = ~0;
 	/*
 	 * NOTE! The in-memory inode i_data array is in little-endian order
 	 * even on big-endian machines: we do NOT byteswap the block numbers!