Message ID | 20090218154310.GH3600@mini-me.lan |
---|---|
State | Accepted, archived |
Headers | show |
4 files changed, 186 insertions(+), 51 deletions(-) > > diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h > index 2d516c0..4ce2187 100644 > --- a/fs/ext4/ext4_i.h > +++ b/fs/ext4/ext4_i.h > @@ -122,6 +122,9 @@ struct ext4_inode_info { > struct list_head i_prealloc_list; > spinlock_t i_prealloc_lock; > > + /* ialloc */ > + ext4_group_t i_last_alloc_group; > + > /* allocation reservation info for delalloc */ > unsigned int i_reserved_data_blocks; > unsigned int i_reserved_meta_blocks; > diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c > index e2eab19..0aeb8c2 100644 > --- a/fs/ext4/extents.c > +++ b/fs/ext4/extents.c > @@ -152,6 +152,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, > ext4_fsblk_t bg_start; > ext4_fsblk_t last_block; > ext4_grpblk_t colour; > + ext4_group_t block_group; > + int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); > int depth; > > if (path) { > @@ -170,10 +172,23 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, > } > > /* OK. use inode's group */ > - bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + > + block_group = ei->i_block_group; > + if (flex_size >= 4) { > + block_group &= ~(flex_size-1); > + if (S_ISREG(inode->i_mode)) > + block_group++; > + } Can you explain why we select 4 here ? Also add a comment explaining directory/symlink block allocation goes to first group of flex_bg and regular files goes to second group and which type of workload that would help ? You have the comment in commit message. > + bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + > le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block); > last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; > > + /* > + * If we are doing delayed allocation, we don't need take > + * colour into account. > + */ > + if (test_opt(inode->i_sb, DELALLOC)) > + return bg_start; > + Again why we don't want to look at colour for delayed allocation ? 
> if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) > colour = (current->pid % 16) * > (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); > diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c > index 21080ab..212e5be 100644 > --- a/fs/ext4/ialloc.c > +++ b/fs/ext4/ialloc.c > @@ -409,6 +409,42 @@ out: > } > > /* > + * Helper function for Orlov's allocator; returns critical information > + * for a particular block group or flex_bg > + */ > +struct orlov_stats { > + __u32 free_inodes; > + __u32 free_blocks; > + __u32 used_dirs; > +}; > + > +void get_orlov_stats(struct super_block *sb, ext4_group_t g, > + int flex_size, struct orlov_stats *stats) > +{ > + struct ext4_group_desc *desc; > + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; > + int i; > + > + stats->free_inodes = 0; > + stats->free_blocks = 0; > + stats->used_dirs = 0; > + > + g *= flex_size; Can you add a comment to the function saying g can be flex group number or the actual group number depending on flex_size ?. Without that comment the above operation can be confusing. > + > + for (i = 0; i < flex_size; i++) { > + if (g >= ngroups) > + break; > + desc = ext4_get_group_desc(sb, g++, NULL); > + if (!desc) > + continue; > + > + stats->free_inodes += ext4_free_inodes_count(sb, desc); > + stats->free_blocks += ext4_free_blks_count(sb, desc); > + stats->used_dirs += ext4_used_dirs_count(sb, desc); > + } > +} > + > +/* > * Orlov's allocator for directories. > * > * We always try to spread first-level directories. > @@ -423,7 +459,6 @@ out: > * it has too many directories already (max_dirs) or > * it has too few free inodes left (min_inodes) or > * it has too few free blocks left (min_blocks) or > - * it's already running too large debt (max_debt). > * Parent's group is preferred, if it doesn't satisfy these > * conditions we search cyclically through the rest. 
If none > * of the groups look good we just look for a group with more > @@ -437,7 +472,7 @@ out: > #define BLOCK_COST 256 You can also remove further description about debt and INODE_COST and BLOCK_COST > > static int find_group_orlov(struct super_block *sb, struct inode *parent, > - ext4_group_t *group) > + ext4_group_t *group, int mode) > { > ext4_group_t parent_group = EXT4_I(parent)->i_block_group; > struct ext4_sb_info *sbi = EXT4_SB(sb); > @@ -446,12 +481,21 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, > int inodes_per_group = EXT4_INODES_PER_GROUP(sb); > unsigned int freei, avefreei; > ext4_fsblk_t freeb, avefreeb; > - ext4_fsblk_t blocks_per_dir; > unsigned int ndirs; > - int max_debt, max_dirs, min_inodes; > + int max_dirs, min_inodes; > ext4_grpblk_t min_blocks; > - ext4_group_t i; > + ext4_group_t i, grp, g; > struct ext4_group_desc *desc; > + struct orlov_stats stats; > + int flex_size = ext4_flex_bg_size(sbi); > + > + if (flex_size < 4) > + flex_size = 1; > + else { > + ngroups = (ngroups + flex_size - 1) >> > + sbi->s_log_groups_per_flex; > + parent_group >>= sbi->s_log_groups_per_flex; > + } > > freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); > avefreei = freei / ngroups; > @@ -460,71 +504,85 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, > do_div(avefreeb, ngroups); > ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); > > - if ((parent == sb->s_root->d_inode) || > - (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) { > + if (S_ISDIR(mode) && > + ((parent == sb->s_root->d_inode) || > + (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) { > int best_ndir = inodes_per_group; > - ext4_group_t grp; > int ret = -1; > > get_random_bytes(&grp, sizeof(grp)); > parent_group = (unsigned)grp % ngroups; > for (i = 0; i < ngroups; i++) { > - grp = (parent_group + i) % ngroups; > - desc = ext4_get_group_desc(sb, grp, NULL); > - if (!desc || !ext4_free_inodes_count(sb, desc)) > + g = 
(parent_group + i) % ngroups; > + get_orlov_stats(sb, g, flex_size, &stats); > + if (!stats.free_inodes) > continue; > - if (ext4_used_dirs_count(sb, desc) >= best_ndir) > + if (stats.used_dirs >= best_ndir) > continue; > - if (ext4_free_inodes_count(sb, desc) < avefreei) > + if (stats.free_inodes < avefreei) > continue; > - if (ext4_free_blks_count(sb, desc) < avefreeb) > + if (stats.free_blocks < avefreeb) > continue; > - *group = grp; > + grp = g; > ret = 0; > - best_ndir = ext4_used_dirs_count(sb, desc); > + best_ndir = stats.used_dirs; > + } > + if (ret) > + goto fallback; > + found_flex_bg: > + if (flex_size == 1) { > + *group = grp; > + return 0; > + } > + > + grp *= flex_size; > + for (i = 1; i < flex_size; i++) { Why we start with i = 1 ? > + if (grp+i >= sbi->s_groups_count) > + break; > + desc = ext4_get_group_desc(sb, grp+i, NULL); > + if (desc && ext4_free_inodes_count(sb, desc)) { Can you add a comment saying that we just pick the first group with free inode because the goal block for rest of the block allocation of the file/directory looks at the flex block group number with flex_bg. 
(more details on ext4_ext_find_goal) > + *group = grp+i; > + return 0; > + } > + } > + desc = ext4_get_group_desc(sb, grp, NULL); > + if (desc && ext4_free_inodes_count(sb, desc)) { > + *group = grp; > + return 0; > } > - if (ret == 0) > - return ret; > goto fallback; > } > > - blocks_per_dir = ext4_blocks_count(es) - freeb; > - do_div(blocks_per_dir, ndirs); > - > max_dirs = ndirs / ngroups + inodes_per_group / 16; > - min_inodes = avefreei - inodes_per_group / 4; > - min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4; > - > - max_debt = EXT4_BLOCKS_PER_GROUP(sb); > - max_debt /= max_t(int, blocks_per_dir, BLOCK_COST); > - if (max_debt * INODE_COST > inodes_per_group) > - max_debt = inodes_per_group / INODE_COST; > - if (max_debt > 255) > - max_debt = 255; > - if (max_debt == 0) > - max_debt = 1; > + min_inodes = avefreei - inodes_per_group*flex_size / 4; > + if (min_inodes < 1) > + min_inodes = 1; > + min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4; > > for (i = 0; i < ngroups; i++) { > - *group = (parent_group + i) % ngroups; > - desc = ext4_get_group_desc(sb, *group, NULL); > - if (!desc || !ext4_free_inodes_count(sb, desc)) > + grp = (parent_group + i) % ngroups; > + get_orlov_stats(sb, grp, flex_size, &stats); > + if (stats.used_dirs >= max_dirs) > continue; > - if (ext4_used_dirs_count(sb, desc) >= max_dirs) > + if (stats.free_inodes < min_inodes) > continue; > - if (ext4_free_inodes_count(sb, desc) < min_inodes) > + if (stats.free_blocks < min_blocks) > continue; > - if (ext4_free_blks_count(sb, desc) < min_blocks) > - continue; > - return 0; > + goto found_flex_bg; > } > > fallback: > + ngroups = sbi->s_groups_count; > + avefreei = freei / ngroups; > + parent_group = EXT4_I(parent)->i_block_group; > for (i = 0; i < ngroups; i++) { > - *group = (parent_group + i) % ngroups; > - desc = ext4_get_group_desc(sb, *group, NULL); > + grp = (parent_group + i) % ngroups; > + desc = ext4_get_group_desc(sb, grp, NULL); > if (desc && 
ext4_free_inodes_count(sb, desc) && > - ext4_free_inodes_count(sb, desc) >= avefreei) > + ext4_free_inodes_count(sb, desc) >= avefreei) { > + *group = grp; > return 0; > + } > } > > if (avefreei) { > @@ -540,12 +598,54 @@ fallback: > } > > static int find_group_other(struct super_block *sb, struct inode *parent, > - ext4_group_t *group) > + ext4_group_t *group, int mode) > { > ext4_group_t parent_group = EXT4_I(parent)->i_block_group; > ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; > struct ext4_group_desc *desc; > - ext4_group_t i; > + ext4_group_t i, last; > + int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); > + > + /* > + * If we are doing flex_bg style allocation, try to put > + * special inodes in the first block group; start files and > + * directories at the 2nd block group in the flex_bg. > + */ Why ? Can you explain whether this placing helps any specific work load ? or something where you have observed that this placement helps ? > + if (flex_size >= 4) { > + int ret, retry = 0; > + > + try_again: > + i = (parent_group & ~(flex_size-1)); > + last = i+flex_size; > + if (last > ngroups) > + last = ngroups; > + if (S_ISDIR(mode) || S_ISREG(mode)) > + i++; > + while (i < last) { > + desc = ext4_get_group_desc(sb, i, NULL); > + if (desc && ext4_free_inodes_count(sb, desc)) { > + *group = i; > + return 0; > + } > + i++; > + } > + if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) { > + retry = 1; > + parent_group = EXT4_I(parent)->i_last_alloc_group; > + goto try_again; > + } > + /* > + * If this didn't work, use the Orlov search > + * algorithm; we pass in the mode to avoid the subdir > + * of topdir algorithms. If successful, save the > + * group that we used so that future allocations in > + * that directory are stable. 
> + */ > + ret = find_group_orlov(sb, parent, &parent_group, mode); > + if (ret == 0) > + EXT4_I(parent)->i_last_alloc_group = parent_group; > + return ret; > + } > > /* > * Try to place the inode in its parent directory > @@ -713,10 +813,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) > sbi = EXT4_SB(sb); > es = sbi->s_es; > > - if (sbi->s_log_groups_per_flex) { > + if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) { > ret2 = find_group_flex(sb, dir, &group); > if (ret2 == -1) { > - ret2 = find_group_other(sb, dir, &group); > + ret2 = find_group_other(sb, dir, &group, mode); > if (ret2 == 0) > printk(KERN_NOTICE "ext4: find_group_flex " > "failed, fallback succeeded dir %lu\n", > @@ -729,9 +829,9 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) > if (test_opt(sb, OLDALLOC)) > ret2 = find_group_dir(sb, dir, &group); > else > - ret2 = find_group_orlov(sb, dir, &group); > + ret2 = find_group_orlov(sb, dir, &group, mode); > } else > - ret2 = find_group_other(sb, dir, &group); > + ret2 = find_group_other(sb, dir, &group, mode); > > got_group: > err = -ENOSPC; > @@ -890,6 +990,7 @@ got: > ei->i_file_acl = 0; > ei->i_dtime = 0; > ei->i_block_group = group; > + ei->i_last_alloc_group = ~0; > > ext4_set_inode_flags(inode); > if (IS_DIRSYNC(inode)) > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c > index cbd2ca9..12f1374 100644 > --- a/fs/ext4/inode.c > +++ b/fs/ext4/inode.c > @@ -459,6 +459,8 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) > ext4_fsblk_t bg_start; > ext4_fsblk_t last_block; > ext4_grpblk_t colour; > + ext4_group_t block_group; > + int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); > > /* Try to find previous block */ > for (p = ind->p - 1; p >= start; p--) { > @@ -474,9 +476,22 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) > * It is going to be referred to from the inode itself? 
OK, just put it > * into the same cylinder group then. > */ > - bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group); > + block_group = ei->i_block_group; > + if (flex_size >= 4) { > + block_group &= ~(flex_size-1); > + if (S_ISREG(inode->i_mode)) > + block_group++; > + } > + bg_start = ext4_group_first_block_no(inode->i_sb, block_group); > last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; > > + /* > + * If we are doing delayed allocation, we don't need take > + * colour into account. > + */ > + if (test_opt(inode->i_sb, DELALLOC)) > + return bg_start; > + > if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) > colour = (current->pid % 16) * > (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); > @@ -4250,6 +4265,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) > ei->i_disksize = inode->i_size; > inode->i_generation = le32_to_cpu(raw_inode->i_generation); > ei->i_block_group = iloc.block_group; > + ei->i_last_alloc_group = ~0; > /* > * NOTE! The in-memory inode i_data array is in little-endian order > * even on big-endian machines: we do NOT byteswap the block numbers! > -- -aneesh -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Feb 24, 2009 at 02:29:31PM +0530, Aneesh Kumar K.V wrote: > > /* OK. use inode's group */ > > - bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + > > + block_group = ei->i_block_group; > > + if (flex_size >= 4) { > > + block_group &= ~(flex_size-1); > > + if (S_ISREG(inode->i_mode)) > > + block_group++; > > + } > > > Can you explain why we select 4 here ? > > Also add a comment explaining directory/symlink block allocation goes to > first group of flex_bg and regular files goes to second group and which > type of workload that would help ? You have the comment in commit message. Yeah, I'll add a comment there. > > + /* > > + * If we are doing delayed allocation, we don't need take > > + * colour into account. > > + */ > > + if (test_opt(inode->i_sb, DELALLOC)) > > + return bg_start; > > + > > Again why we don't want to look at colour for delayed allocation ? If we're doing delayed allocation, we'll be allocating data blocks in large chunks at a time. Colour is much more important if you have multiple processes allocating singleton blocks at a time, so you don't get interleaved allocation (i.e, ABABAB or ABCABBCABC). But if we're grabbing chunks of blocks at a time, the benefits largely go away, and in fact, artificially starting at different location depending on the process id can actually lead to a greater fragmentation of the free space. > > /* > > + * Helper function for Orlov's allocator; returns critical information > > + * for a particular block group or flex_bg > > + */ > > +struct orlov_stats { > > + __u32 free_inodes; > > + __u32 free_blocks; > > + __u32 used_dirs; > > +}; > > + > > +void get_orlov_stats(struct super_block *sb, ext4_group_t g, > > + int flex_size, struct orlov_stats *stats) > > +{ ... > > + g *= flex_size; > > Can you add a comment to the function saying g can be flex group number > or the actual group number depending on flex_size ?. Without that > comment the above operation can be confusing. Sure. 
> > +/* > > * Orlov's allocator for directories. > > * > > You can also remove further description about debt and INODE_COST and > BLOCK_COST > Yep, good point. > > + found_flex_bg: > > + if (flex_size == 1) { > > + *group = grp; > > + return 0; > > + } > > + > > + grp *= flex_size; > > + for (i = 1; i < flex_size; i++) { > > Why we start with i = 1 ? > I was putting directories in the second bg of flexgroups; thinking about it some more, there's really no good reason to do that. I'll change that back to be one. > Can you add a comment saying that we just pick the first group with > free inode because the goal block for rest of the block allocation > of the file/directory looks at the flex block group number with > flex_bg. (more details on ext4_ext_find_goal) I'll do that. > > + /* > > + * If we are doing flex_bg style allocation, try to put > > + * special inodes in the first block group; start files and > > + * directories at the 2nd block group in the flex_bg. > > + */ > > Why ? Can you explain whether this placing helps any specific work load > ? or something where you have observed that this placement helps ? This was left over from when I was using the inode number to influence block allocation. We're not doing this any more, so this should go away. Thanks for asking the question. - Ted -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Feb 24, 2009 at 10:27:34AM -0500, Theodore Tso wrote: > > > + /* > > > + * If we are doing flex_bg style allocation, try to put > > > + * special inodes in the first block group; start files and > > > + * directories at the 2nd block group in the flex_bg. > > > + */ > > > > Why ? Can you explain whether this placing helps any specific work load > > ? or something where you have observed that this placement helps ? > > This was left over from when I was using the inode number to influence > block allocation. We're not doing this any more, so this should go > away. Thanks for asking the question. Hm, I just tried taking it out, and it costs a 17% increase in e2fsck time on my test filesystem. The reason is pass 2, we need to check to make sure the filetype information in the directory blocks is correct. If the inode in question is a regular file or a directory, we can determine that by looking at an inode bitmap. However, if it is a named pipe, device file, or symlink, we can only determine what it is by reading the inode. In the filesystem in question, which is an Ubuntu Intrepid image, there are 5 character device files, 1 block device file, 5 named pipes --- and 20,613 symbolic links. If we group all of these inodes together, it saves about 3 seconds in pass 2 (13 seconds versus 17 seconds). We can also solve this problem by caching the file type information. For example, I could store a list of all symlink inodes, and if there are only 20k symlinks, it will end up costing us 80k of memory. So this might be a problem which is better solved with some e2fsck hacking (which has the advantage that it will speed up ext3 fsck runs as well). - Ted -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Ted Ts'o wrote something like the following (didn't get original email): > @@ -122,6 +122,9 @@ struct ext4_inode_info { > struct list_head i_prealloc_list; > spinlock_t i_prealloc_lock; > > + /* ialloc */ > + ext4_group_t i_last_alloc_group; Even better would be to store i_last_alloc_inode. In the past Eric has demonstrated workloads that are allocating lots of inodes exhibit O(n^2) behaviour because the entire group bitmap is searched from the start each time, and that can cumulatively be very slow. Having the directory start searching from the most recently allocated inode would make this O(n), and would not significantly alter behaviour. If the inode is newly read from disk this would be initialized to 0, or the inode number of the directory. If it was recently creating files it should be the previously allocated inode. The only issue is with inodes being repeatedly created and deleted in the same directory, in which case we could reset this at delete time to be max(first inode in group, directory inode) so that the same inodes keep being reused by the directory. This could be limited to clearing last_alloc_inode if the inode being freed is in the parent group. Hmm, it seems something like this is already done in find_group_orlov(). More discussion there. > @@ -170,10 +172,23 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, > + if (flex_size >= 4) { > + block_group &= ~(flex_size-1); > + if (S_ISREG(inode->i_mode)) > + block_group++; > + } The usage of "4" in several places to threshold flexbg size is also confusing to me. > + bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + > le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block); > last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; Similar to the above proposal, Eric and I discussed storing the most recently selected (flex) group in the superblock would avoid repeated group scanning for allocations in separate directories. 
> @@ -446,12 +481,21 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, > + if (flex_size < 4) > + flex_size = 1; > + else { Generally CodingStyle has { } around the first "if" clause if they are around the "else" clause. > @@ -460,71 +504,85 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, > + if (S_ISDIR(mode) && > + ((parent == sb->s_root->d_inode) || > + (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) { > > get_random_bytes(&grp, sizeof(grp)); > parent_group = (unsigned)grp % ngroups; > for (i = 0; i < ngroups; i++) { > + g = (parent_group + i) % ngroups; > + get_orlov_stats(sb, g, flex_size, &stats); Two notes here - - get_random_bytes() is actually a fairly CPU-intensive operation, though I'm not sure it is significant in this context - more importantly, scanning ALL groups in a very large filesystem to find an optimal group. Large filesystems can have upwards of 60-100k groups to scan, and even with flexbg it seems that get_orlov_stats() is not using the flex_group[] stats in the common case where flex_size is the same as the flexbg size, but recalculating these for every allocation. > + if (!stats.free_inodes) > continue; > + if (stats.used_dirs >= best_ndir) > continue; > + if (stats.free_inodes < avefreei) > continue; > + if (stats.free_blocks < avefreeb) > continue; > + grp = g; > ret = 0; > + best_ndir = stats.used_dirs; Based on the above, it would be preferable to settle for a "local" optimal group after scanning N successful candidates, then checking the "most recently selected" group saved in the superblock if it is still better than the local optimum. Of course it should continue to scan all groups if there are no suitable groups yet found. Since the starting group is random, this will avoid being stuck in a bad local minimum, while avoiding an exhaustive search when there are a lot of groups. 
> + min_inodes = avefreei - inodes_per_group*flex_size / 4; > + min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4; style - "inodes_per_group * flex_size / 4" > static int find_group_other(struct super_block *sb, struct inode *parent, > + ext4_group_t *group, int mode) > { > + /* > + * If we are doing flex_bg style allocation, try to put > + * special inodes in the first block group; start files and > + * directories at the 2nd block group in the flex_bg. > + */ > + if (flex_size >= 4) { > + int ret, retry = 0; > + > + try_again: > + i = (parent_group & ~(flex_size-1)); > + last = i+flex_size; Can you please use a better variable name than "i", "grp", or "new_group" or similar? > + while (i < last) { > + desc = ext4_get_group_desc(sb, i, NULL); > + if (desc && ext4_free_inodes_count(sb, desc)) { > + *group = i; > + return 0; > + } > + i++; > + } > + if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) { > + retry = 1; > + parent_group = EXT4_I(parent)->i_last_alloc_group; > + goto try_again; > + } It would seem that using i_last_alloc_group FIRST (if set) would be better than always checking all groups in the flexbg. Either i_last_alloc_group is in the same flex group as parent_group (so no harm done) or a previous allocation wasn't able to find any free inode in that flex group and it had to do a more expensive search elsewhere. All that needs to be changed is to possibly reset i_last_alloc_group to be parent_group when there is a file unlinked from the directory, though this wouldn't be great for huge directories since any unlink would cause a full scan. Hmm, maybe checking ONLY the parent group for free inodes, then the i_last_alloc_group, then remaining groups in the flex group, then all remaining groups is the best ordering. 
It allows inodes to be clustered if there is space in the parent group (whether due to freeing inodes in this directory or others), but avoids a potentially lengthy scan for each inode (which might be noticeable if there are 64 or 128 groups per flexbg). Another possibility is if the starting goal inode was based on the group of the majority of dirent inode numbers in the same directory leaf. This would require deferring inode number selection until the dirent is about to be inserted (i.e. delalloc for inodes) by moving the inode selection into ext4_add_entry(). Combining this with a hash->inode range mapping (as I've proposed in the past, range size based on the directory size) would improve the htree->inode ordering and would also improve performance for large directories. It might not be a bad idea to separate the inode selection from the inode initialization anyway, as ext4_new_inode() is pretty huge already, and I can't see anything except the i_ino setting and insert_inode_hash() that uses the inode number. That means it should be possible to defer the inode selection to ext4_add_entry() and ext4_dx_add_entry() once the leaf block is found. Cheers, Andreas -- Andreas Dilger Sr. Staff Engineer, Lustre Group Sun Microsystems of Canada, Inc. -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Andreas Dilger wrote: > Ted Ts'o wrote something like the following (didnt' get original email): >> @@ -122,6 +122,9 @@ struct ext4_inode_info { >> struct list_head i_prealloc_list; >> spinlock_t i_prealloc_lock; >> >> + /* ialloc */ >> + ext4_group_t i_last_alloc_group; > > Even better would be to store i_last_alloc_inode. In the past Eric > has demonstrated workloads that are allocating lots of inodes exhibit > O(n^2) behaviour because the entire group bitmap is searched from the > start each time, and that can cumulatively be very slow. Having the > directory start searching from the most recently allocated inode would > make this O(n), and would not significantly alter behaviour. A very hacky benchmark I had to demonstrate this is at It just creates a directory tree starting at 000/ under the dir it's run in, and times iterations of creates. The tree is created in order, like: 000/000/000/000/000/000 000/000/000/000/000/001 000/000/000/000/000/002 ... 000/000/000/000/000/fff 000/000/000/000/001/000 .... On ext3: # ./seq_mkdirs iter 0: 6.191491 sec iter 1: 8.455782 sec iter 2: 9.435375 sec iter 3: 10.198069 sec iter 4: 10.922969 sec iter 5: 10.800908 sec iter 6: 12.940676 sec iter 7: 15.513261 sec ... On upstream ext4: # ./seq_mkdirs iter 0: 5.628331 sec iter 1: 6.581043 sec iter 2: 6.723445 sec iter 3: 6.567891 sec iter 4: 5.862526 sec iter 5: 6.462064 sec iter 6: 7.208110 sec iter 7: 6.549735 sec ... I did play with saving the last-allocated position but if that's just in-memory then it's a little odd that the first allocation will be potentially much slower, but that's probably acceptable. It also wouldn't fill in gaps when inodes are deleted if you don't re-search from the parent. ISTR that the constant create/delete didn't cause a problem, will need to remind myself why ... 
-Eric -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Eric Sandeen wrote: > Andreas Dilger wrote: >> Ted Ts'o wrote something like the following (didnt' get original email): >>> @@ -122,6 +122,9 @@ struct ext4_inode_info { >>> struct list_head i_prealloc_list; >>> spinlock_t i_prealloc_lock; >>> >>> + /* ialloc */ >>> + ext4_group_t i_last_alloc_group; >> Even better would be to store i_last_alloc_inode. In the past Eric >> has demonstrated workloads that are allocating lots of inodes exhibit >> O(n^2) behaviour because the entire group bitmap is searched from the >> start each time, and that can cumulatively be very slow. Having the >> directory start searching from the most recently allocated inode would >> make this O(n), and would not significantly alter behaviour. > > A very hacky benchmark I had to demonstrate this is at Er, URL please, maestro... http://people.redhat.com/esandeen/benchmarks/seq_mkdirs.c > It just creates a directory tree starting at 000/ under the dir it's run > in, and times iterations of creates. > > The tree is created in order, like: > > 000/000/000/000/000/000 > 000/000/000/000/000/001 > 000/000/000/000/000/002 > ... > 000/000/000/000/000/fff > 000/000/000/000/001/000 > .... > > On ext3: > > # ./seq_mkdirs > iter 0: 6.191491 sec > iter 1: 8.455782 sec > iter 2: 9.435375 sec > iter 3: 10.198069 sec > iter 4: 10.922969 sec > iter 5: 10.800908 sec > iter 6: 12.940676 sec > iter 7: 15.513261 sec > ... > > On upstream ext4: > > # ./seq_mkdirs > iter 0: 5.628331 sec > iter 1: 6.581043 sec > iter 2: 6.723445 sec > iter 3: 6.567891 sec > iter 4: 5.862526 sec > iter 5: 6.462064 sec > iter 6: 7.208110 sec > iter 7: 6.549735 sec > ... > > > I did play with saving the last-allocated position but if that's just > in-memory then it's a little odd that the first allocation will be > potentially much slower, but that's probably acceptable. It also > wouldn't fill in gaps when inodes are deleted if you don't re-search > from the parent. 
ISTR that the constant create/delete didn't cause a > problem, will need to remind myself why ... > > -Eric > -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> Two notes here - > - get_random_bytes() is actually a fairly CPU-intensive operation, though > I'm not sure it is significant in this context > - more importantly, scanning ALL groups in a very large filesystem to find > an optimal group. Large filesystems can have upwards of 60-100k groups > to scan, and even with flexbg it seems that get_orlov_stats() is not > using the flex_group[] stats in the common case where flex_size is the > same as the flexbg size, but recalculating these for every allocation. Note that this only happens when we are creating directories. I'm making the assumption here that creating directories is rare, and certainly happens less often than allocating inodes or blocks. I was deliberately not using flex_group[] stats, since I was thinking that eventually it would be worthwhile to nuke the need to take a spinlock for every inode and block allocation, and it wasn't keeping track of the number of directories, anyway. We also don't actually scan ALL the groups. For top-level directories we do, yes. But most directories aren't top-level; and for non top-level directories, we find the first block group which meets the minimum criteria: for (i = 0; i < ngroups; i++) { grp = (parent_group + i) % ngroups; get_orlov_stats(sb, grp, flex_size, &stats); if (stats.used_dirs >= max_dirs) continue; if (stats.free_inodes < min_inodes) continue; if (stats.free_blocks < min_blocks) continue; goto found_flex_bg; } Looking at Eric's benchmark, it looks like each iteration is creating 196,608 directories. If it takes 6 seconds, then each mkdir is taking 30 microseconds, and if it takes 15 seconds to do an iteration, then each mkdir is taking 76 microseconds. So that variance is large, but how bad does it really get? Also, what workload is likely going to be creating tons and tons of directories? Even a psycho squid cache hierarchy is only 65536 directories. 
> Based on the above, it would be preferable to settle for a "local" > optimal group after scanning N successful candidates, then checking > the "most recently selected" group saved in the superblock if it is > still better than the local optimum. Of course it should continue to > scan all groups if there are no suitable groups yet found. > > Since the starting group is random, this will avoid being stuck in > a bad local minimum, while avoiding an exhaustive search when there > are a lot of groups. The above algorithm is only used for top-level directories. > It would seem that using i_last_alloc_group FIRST (if set) would be better > than always checking all groups in the flexbg. Either i_last_alloc_group > is in the same flex group as parent_group (so no harm done) or a previous > allocation wasn't able to find any free inode in that flex group and it > had to do a more expensive search elsewhere. (This is the algorithm used for non-directories). Yeah, I agree that keeping track of the last flex group and using it first makes sense. I do think it makes sense to start at the beginning of each flex group; checking each block group is pretty fast, since we're not scanning the bitmap; just checking a count value. Even if there are 128 groups per flex group, we're in big trouble if checking the free inodes count for 100+ groups is noticeable. - Ted -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h index 2d516c0..4ce2187 100644 --- a/fs/ext4/ext4_i.h +++ b/fs/ext4/ext4_i.h @@ -122,6 +122,9 @@ struct ext4_inode_info { struct list_head i_prealloc_list; spinlock_t i_prealloc_lock; + /* ialloc */ + ext4_group_t i_last_alloc_group; + /* allocation reservation info for delalloc */ unsigned int i_reserved_data_blocks; unsigned int i_reserved_meta_blocks; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e2eab19..0aeb8c2 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -152,6 +152,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, ext4_fsblk_t bg_start; ext4_fsblk_t last_block; ext4_grpblk_t colour; + ext4_group_t block_group; + int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); int depth; if (path) { @@ -170,10 +172,23 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, } /* OK. use inode's group */ - bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + + block_group = ei->i_block_group; + if (flex_size >= 4) { + block_group &= ~(flex_size-1); + if (S_ISREG(inode->i_mode)) + block_group++; + } + bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block); last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; + /* + * If we are doing delayed allocation, we don't need take + * colour into account. 
+ */ + if (test_opt(inode->i_sb, DELALLOC)) + return bg_start; + if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) colour = (current->pid % 16) * (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 21080ab..212e5be 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -409,6 +409,42 @@ out: } /* + * Helper function for Orlov's allocator; returns critical information + * for a particular block group or flex_bg + */ +struct orlov_stats { + __u32 free_inodes; + __u32 free_blocks; + __u32 used_dirs; +}; + +void get_orlov_stats(struct super_block *sb, ext4_group_t g, + int flex_size, struct orlov_stats *stats) +{ + struct ext4_group_desc *desc; + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + int i; + + stats->free_inodes = 0; + stats->free_blocks = 0; + stats->used_dirs = 0; + + g *= flex_size; + + for (i = 0; i < flex_size; i++) { + if (g >= ngroups) + break; + desc = ext4_get_group_desc(sb, g++, NULL); + if (!desc) + continue; + + stats->free_inodes += ext4_free_inodes_count(sb, desc); + stats->free_blocks += ext4_free_blks_count(sb, desc); + stats->used_dirs += ext4_used_dirs_count(sb, desc); + } +} + +/* * Orlov's allocator for directories. * * We always try to spread first-level directories. @@ -423,7 +459,6 @@ out: * it has too many directories already (max_dirs) or * it has too few free inodes left (min_inodes) or * it has too few free blocks left (min_blocks) or - * it's already running too large debt (max_debt). * Parent's group is preferred, if it doesn't satisfy these * conditions we search cyclically through the rest. 
If none * of the groups look good we just look for a group with more @@ -437,7 +472,7 @@ out: #define BLOCK_COST 256 static int find_group_orlov(struct super_block *sb, struct inode *parent, - ext4_group_t *group) + ext4_group_t *group, int mode) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -446,12 +481,21 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, int inodes_per_group = EXT4_INODES_PER_GROUP(sb); unsigned int freei, avefreei; ext4_fsblk_t freeb, avefreeb; - ext4_fsblk_t blocks_per_dir; unsigned int ndirs; - int max_debt, max_dirs, min_inodes; + int max_dirs, min_inodes; ext4_grpblk_t min_blocks; - ext4_group_t i; + ext4_group_t i, grp, g; struct ext4_group_desc *desc; + struct orlov_stats stats; + int flex_size = ext4_flex_bg_size(sbi); + + if (flex_size < 4) + flex_size = 1; + else { + ngroups = (ngroups + flex_size - 1) >> + sbi->s_log_groups_per_flex; + parent_group >>= sbi->s_log_groups_per_flex; + } freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); avefreei = freei / ngroups; @@ -460,71 +504,85 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, do_div(avefreeb, ngroups); ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); - if ((parent == sb->s_root->d_inode) || - (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) { + if (S_ISDIR(mode) && + ((parent == sb->s_root->d_inode) || + (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) { int best_ndir = inodes_per_group; - ext4_group_t grp; int ret = -1; get_random_bytes(&grp, sizeof(grp)); parent_group = (unsigned)grp % ngroups; for (i = 0; i < ngroups; i++) { - grp = (parent_group + i) % ngroups; - desc = ext4_get_group_desc(sb, grp, NULL); - if (!desc || !ext4_free_inodes_count(sb, desc)) + g = (parent_group + i) % ngroups; + get_orlov_stats(sb, g, flex_size, &stats); + if (!stats.free_inodes) continue; - if (ext4_used_dirs_count(sb, desc) >= best_ndir) + if (stats.used_dirs >= 
best_ndir) continue; - if (ext4_free_inodes_count(sb, desc) < avefreei) + if (stats.free_inodes < avefreei) continue; - if (ext4_free_blks_count(sb, desc) < avefreeb) + if (stats.free_blocks < avefreeb) continue; - *group = grp; + grp = g; ret = 0; - best_ndir = ext4_used_dirs_count(sb, desc); + best_ndir = stats.used_dirs; + } + if (ret) + goto fallback; + found_flex_bg: + if (flex_size == 1) { + *group = grp; + return 0; + } + + grp *= flex_size; + for (i = 1; i < flex_size; i++) { + if (grp+i >= sbi->s_groups_count) + break; + desc = ext4_get_group_desc(sb, grp+i, NULL); + if (desc && ext4_free_inodes_count(sb, desc)) { + *group = grp+i; + return 0; + } + } + desc = ext4_get_group_desc(sb, grp, NULL); + if (desc && ext4_free_inodes_count(sb, desc)) { + *group = grp; + return 0; } - if (ret == 0) - return ret; goto fallback; } - blocks_per_dir = ext4_blocks_count(es) - freeb; - do_div(blocks_per_dir, ndirs); - max_dirs = ndirs / ngroups + inodes_per_group / 16; - min_inodes = avefreei - inodes_per_group / 4; - min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4; - - max_debt = EXT4_BLOCKS_PER_GROUP(sb); - max_debt /= max_t(int, blocks_per_dir, BLOCK_COST); - if (max_debt * INODE_COST > inodes_per_group) - max_debt = inodes_per_group / INODE_COST; - if (max_debt > 255) - max_debt = 255; - if (max_debt == 0) - max_debt = 1; + min_inodes = avefreei - inodes_per_group*flex_size / 4; + if (min_inodes < 1) + min_inodes = 1; + min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4; for (i = 0; i < ngroups; i++) { - *group = (parent_group + i) % ngroups; - desc = ext4_get_group_desc(sb, *group, NULL); - if (!desc || !ext4_free_inodes_count(sb, desc)) + grp = (parent_group + i) % ngroups; + get_orlov_stats(sb, grp, flex_size, &stats); + if (stats.used_dirs >= max_dirs) continue; - if (ext4_used_dirs_count(sb, desc) >= max_dirs) + if (stats.free_inodes < min_inodes) continue; - if (ext4_free_inodes_count(sb, desc) < min_inodes) + if (stats.free_blocks < 
min_blocks) continue; - if (ext4_free_blks_count(sb, desc) < min_blocks) - continue; - return 0; + goto found_flex_bg; } fallback: + ngroups = sbi->s_groups_count; + avefreei = freei / ngroups; + parent_group = EXT4_I(parent)->i_block_group; for (i = 0; i < ngroups; i++) { - *group = (parent_group + i) % ngroups; - desc = ext4_get_group_desc(sb, *group, NULL); + grp = (parent_group + i) % ngroups; + desc = ext4_get_group_desc(sb, grp, NULL); if (desc && ext4_free_inodes_count(sb, desc) && - ext4_free_inodes_count(sb, desc) >= avefreei) + ext4_free_inodes_count(sb, desc) >= avefreei) { + *group = grp; return 0; + } } if (avefreei) { @@ -540,12 +598,54 @@ fallback: } static int find_group_other(struct super_block *sb, struct inode *parent, - ext4_group_t *group) + ext4_group_t *group, int mode) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; struct ext4_group_desc *desc; - ext4_group_t i; + ext4_group_t i, last; + int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); + + /* + * If we are doing flex_bg style allocation, try to put + * special inodes in the first block group; start files and + * directories at the 2nd block group in the flex_bg. + */ + if (flex_size >= 4) { + int ret, retry = 0; + + try_again: + i = (parent_group & ~(flex_size-1)); + last = i+flex_size; + if (last > ngroups) + last = ngroups; + if (S_ISDIR(mode) || S_ISREG(mode)) + i++; + while (i < last) { + desc = ext4_get_group_desc(sb, i, NULL); + if (desc && ext4_free_inodes_count(sb, desc)) { + *group = i; + return 0; + } + i++; + } + if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) { + retry = 1; + parent_group = EXT4_I(parent)->i_last_alloc_group; + goto try_again; + } + /* + * If this didn't work, use the Orlov search + * algorithm; we pass in the mode to avoid the subdir + * of topdir algorithms. If successful, save the + * group that we used so that future allocations in + * that directory are stable. 
+ */ + ret = find_group_orlov(sb, parent, &parent_group, mode); + if (ret == 0) + EXT4_I(parent)->i_last_alloc_group = parent_group; + return ret; + } /* * Try to place the inode in its parent directory @@ -713,10 +813,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) sbi = EXT4_SB(sb); es = sbi->s_es; - if (sbi->s_log_groups_per_flex) { + if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) { ret2 = find_group_flex(sb, dir, &group); if (ret2 == -1) { - ret2 = find_group_other(sb, dir, &group); + ret2 = find_group_other(sb, dir, &group, mode); if (ret2 == 0) printk(KERN_NOTICE "ext4: find_group_flex " "failed, fallback succeeded dir %lu\n", @@ -729,9 +829,9 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) if (test_opt(sb, OLDALLOC)) ret2 = find_group_dir(sb, dir, &group); else - ret2 = find_group_orlov(sb, dir, &group); + ret2 = find_group_orlov(sb, dir, &group, mode); } else - ret2 = find_group_other(sb, dir, &group); + ret2 = find_group_other(sb, dir, &group, mode); got_group: err = -ENOSPC; @@ -890,6 +990,7 @@ got: ei->i_file_acl = 0; ei->i_dtime = 0; ei->i_block_group = group; + ei->i_last_alloc_group = ~0; ext4_set_inode_flags(inode); if (IS_DIRSYNC(inode)) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cbd2ca9..12f1374 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -459,6 +459,8 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) ext4_fsblk_t bg_start; ext4_fsblk_t last_block; ext4_grpblk_t colour; + ext4_group_t block_group; + int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); /* Try to find previous block */ for (p = ind->p - 1; p >= start; p--) { @@ -474,9 +476,22 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) * It is going to be referred to from the inode itself? OK, just put it * into the same cylinder group then. 
*/ - bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group); + block_group = ei->i_block_group; + if (flex_size >= 4) { + block_group &= ~(flex_size-1); + if (S_ISREG(inode->i_mode)) + block_group++; + } + bg_start = ext4_group_first_block_no(inode->i_sb, block_group); last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; + /* + * If we are doing delayed allocation, we don't need take + * colour into account. + */ + if (test_opt(inode->i_sb, DELALLOC)) + return bg_start; + if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) colour = (current->pid % 16) * (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); @@ -4250,6 +4265,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_disksize = inode->i_size; inode->i_generation = le32_to_cpu(raw_inode->i_generation); ei->i_block_group = iloc.block_group; + ei->i_last_alloc_group = ~0; /* * NOTE! The in-memory inode i_data array is in little-endian order * even on big-endian machines: we do NOT byteswap the block numbers!
The find_group_flex() inode allocator is now only used if the filesystem is mounted using the "oldalloc" mount option. It is replaced with the original Orlov allocator that has been updated for flex_bg filesystems (it should behave the same way if flex_bg is disabled). The inode allocator now functions by taking into account each flex_bg group, instead of each block group, when deciding whether or not it's time to allocate a new directory into a fresh flex_bg. The block allocator has also been changed so that the first block group in each flex_bg is preferred for use for storing directory blocks. This keeps directory blocks close together, which is good for speeding up e2fsck since large directories are more likely to look like this: debugfs: stat /home/tytso/Maildir/cur Inode: 1844562 Type: directory Mode: 0700 Flags: 0x81000 Generation: 1132745781 Version: 0x00000000:0000ad71 User: 15806 Group: 15806 Size: 1060864 File ACL: 0 Directory ACL: 0 Links: 2 Blockcount: 2072 Fragment: Address: 0 Number: 0 Size: 0 ctime: 0x499c0ff4:164961f4 -- Wed Feb 18 08:41:08 2009 atime: 0x499c0ff4:00000000 -- Wed Feb 18 08:41:08 2009 mtime: 0x49957f51:00000000 -- Fri Feb 13 09:10:25 2009 crtime: 0x499c0f57:00d51440 -- Wed Feb 18 08:38:31 2009 Size of extra inode fields: 28 BLOCKS: (0):7348651, (1-258):7348654-7348911 TOTAL: 259 Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/ext4/ext4_i.h | 3 + fs/ext4/extents.c | 17 ++++- fs/ext4/ialloc.c | 199 ++++++++++++++++++++++++++++++++++++++++------------- fs/ext4/inode.c | 18 +++++- 4 files changed, 186 insertions(+), 51 deletions(-)