Message ID | 1489657877-34478-1-git-send-email-artem.blagodarenko@seagate.com |
---|---|
State | Superseded |
Headers | show |
On Mar 16, 2017, at 3:51 AM, Artem Blagodarenko <artem.blagodarenko@gmail.com> wrote: > > From: Artem Blagodarenko <artem.blagodarenko@gmail.com> > > This INCOMPAT_LARGEDIR feature allows larger directories to be created > in ldiskfs, both with directory sizes over 2GB and and a maximum htree > depth of 3 instead of the current limit of 2. These features are needed > in order to exceed the current limit of approximately 10M entries in a > single directory. Artem, many thanks for updating and submitting this, and creating the e2fsck patches. Ted, can you please add the original author for the largedir ext4 patch when landing this patch: Liang Zhen <liang.zhen@intel.com> and these tags which contain more background information on this feature: Lustre-change: https://review.hpdd.intel.com/375 Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-50 > Signed-off-by: Yang Sheng <yang.sheng@intel.com> > Signed-off-by: Artem Blagodarenko <artem.blagodarenko@seagate.com> Reviewed-by: Andreas Dilger <andreas.dilger@intel.com> > --- > fs/ext4/ext4.h | 23 ++++++++--- > fs/ext4/inode.c | 4 +- > fs/ext4/namei.c | 118 +++++++++++++++++++++++++++++++++++++++----------------- > 3 files changed, 102 insertions(+), 43 deletions(-) > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h > index 01d52b9..0bbbd9b 100644 > --- a/fs/ext4/ext4.h > +++ b/fs/ext4/ext4.h > @@ -1799,7 +1799,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) > EXT4_FEATURE_INCOMPAT_MMP | \ > EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ > EXT4_FEATURE_INCOMPAT_ENCRYPT | \ > - EXT4_FEATURE_INCOMPAT_CSUM_SEED) > + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ > + EXT4_FEATURE_INCOMPAT_LARGEDIR) > #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ > EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ > EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ > @@ -2125,6 +2126,16 @@ struct dir_private_info { > */ > #define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) > > +/* htree levels for ext4 */ > +#define EXT4_HTREE_LEVEL_COMPAT 2 > +#define EXT4_HTREE_LEVEL 3 > + > +static inline int ext4_dir_htree_level(struct super_block *sb) > +{ > + return ext4_has_feature_largedir(sb) ? > + EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; > +} > + > /* > * Timeout and state flag for lazy initialization inode thread. > */ > @@ -2758,13 +2769,15 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, > es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); > } > > -static inline loff_t ext4_isize(struct ext4_inode *raw_inode) > +static inline loff_t ext4_isize(struct super_block *sb, > + struct ext4_inode *raw_inode) > { > - if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) > + if (ext4_has_feature_largedir(sb) || > + S_ISREG(le16_to_cpu(raw_inode->i_mode))) > return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | > le32_to_cpu(raw_inode->i_size_lo); > - else > - return (loff_t) le32_to_cpu(raw_inode->i_size_lo); > + > + return (loff_t) le32_to_cpu(raw_inode->i_size_lo); > } > > static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c > index f622d4a..5787f3d 100644 > --- a/fs/ext4/inode.c > +++ b/fs/ext4/inode.c > @@ -4682,7 +4682,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) > if (ext4_has_feature_64bit(sb)) > ei->i_file_acl |= > ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; > - inode->i_size = ext4_isize(raw_inode); > + inode->i_size = ext4_isize(sb, raw_inode); > if ((size = i_size_read(inode)) < 0) { > EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size); > ret = -EFSCORRUPTED; > @@ -5008,7 +5008,7 @@ static int ext4_do_update_inode(handle_t *handle, > raw_inode->i_file_acl_high = > cpu_to_le16(ei->i_file_acl >> 32); > raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); > - if (ei->i_disksize != ext4_isize(raw_inode)) { > + if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) { > ext4_isize_set(raw_inode, ei->i_disksize); > need_datasync = 1; > } > diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c > index 6ad612c..3298fe3 100644 > --- a/fs/ext4/namei.c > +++ b/fs/ext4/namei.c > @@ -513,7 +513,7 @@ static inline int ext4_handle_dirty_dx_node(handle_t *handle, > > static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) > { > - return le32_to_cpu(entry->block) & 0x00ffffff; > + return le32_to_cpu(entry->block) & 0x0fffffff; > } It wouldn't be terrible to add a comment to this function explaining why the block numbers are masked off, which is to allow the high bits of the logical block number to hold a "fullness" fraction so that the kernel could start shrinking directories if many files are deleted. That said, this isn't a shortcoming of this patch, but a suggestion if the patch is being resubmitted for some other reason. > static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) > @@ -739,6 +739,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, > struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); > u32 hash; > > + memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0])); > frame->bh = ext4_read_dirblock(dir, 0, INDEX); > if (IS_ERR(frame->bh)) > return (struct dx_frame *) frame->bh; > @@ -768,9 +769,15 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, > } > > indirect = root->info.indirect_levels; > - if (indirect > 1) { > - ext4_warning_inode(dir, "Unimplemented hash depth: %#06x", > - root->info.indirect_levels); > + if (indirect >= ext4_dir_htree_level(dir->i_sb)) { > + ext4_warning(dir->i_sb, > + "Directory (ino: %lu) htree depth %#06x exceed" > + "supported value", dir->i_ino, > + ext4_dir_htree_level(dir->i_sb)); > + if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) { > + ext4_warning(dir->i_sb, "Enable large directory " > + "feature to access it"); > + } > goto fail; > } > > @@ -859,12 +866,19 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, > > static void dx_release(struct dx_frame *frames) > { > + struct dx_root_info *info; > + int i; > + > if (frames[0].bh == NULL) > return; > > - if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels) > - brelse(frames[1].bh); > - brelse(frames[0].bh); > + info = &((struct dx_root *)frames[0].bh->b_data)->info; > + for (i = 0; i <= info->indirect_levels; i++) { > + if (frames[i].bh == NULL) > + break; > + brelse(frames[i].bh); > + frames[i].bh = NULL; > + } > } > > /* > @@ -1050,7 +1064,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, > { > struct dx_hash_info hinfo; > struct ext4_dir_entry_2 *de; > - struct dx_frame frames[2], *frame; > + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; > struct inode *dir; > ext4_lblk_t block; > int count = 0; > @@ -1517,7 +1531,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, > struct ext4_dir_entry_2 **res_dir) > { > struct super_block * sb = dir->i_sb; > - struct dx_frame frames[2], *frame; > + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; > const struct qstr *d_name = fname->usr_fname; > struct buffer_head *bh; > ext4_lblk_t block; > @@ -1947,7 +1961,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, > */ > dir->i_mtime = dir->i_ctime = current_time(dir); > ext4_update_dx_flag(dir); > - dir->i_version++; > + inode_inc_iversion(dir); > ext4_mark_inode_dirty(handle, dir); > BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); > err = ext4_handle_dirty_dirent_node(handle, dir, bh); > @@ -1966,7 +1980,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, > { > struct buffer_head *bh2; > struct dx_root *root; > - struct dx_frame frames[2], *frame; > + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; > struct dx_entry *entries; > struct ext4_dir_entry_2 *de, *de2; > struct ext4_dir_entry_tail *t; > @@ -2185,13 +2199,16 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, > static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, > struct inode *dir, struct inode *inode) > { > - struct dx_frame frames[2], *frame; > + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; > struct dx_entry *entries, *at; > struct buffer_head *bh; > struct super_block *sb = dir->i_sb; > struct ext4_dir_entry_2 *de; > + int restart; > int err; > > +again: > + restart = 0; > frame = dx_probe(fname, dir, NULL, frames); > if (IS_ERR(frame)) > return PTR_ERR(frame); > @@ -2213,24 +2230,44 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, > if (err != -ENOSPC) > goto cleanup; > > + err = 0; > /* Block full, should compress but for now just split */ > dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", > dx_get_count(entries), dx_get_limit(entries))); > /* Need to split index? */ > if (dx_get_count(entries) == dx_get_limit(entries)) { > ext4_lblk_t newblock; > - unsigned icount = dx_get_count(entries); > - int levels = frame - frames; > + int levels = frame - frames + 1; > + unsigned int icount; > + int add_level = 1; > struct dx_entry *entries2; > struct dx_node *node2; > struct buffer_head *bh2; > > - if (levels && (dx_get_count(frames->entries) == > - dx_get_limit(frames->entries))) { > - ext4_warning_inode(dir, "Directory index full!"); > + while (frame > frames) { > + if (dx_get_count((frame - 1)->entries) < > + dx_get_limit((frame - 1)->entries)) { > + add_level = 0; > + break; > + } > + frame--; /* split higher index block */ > + at = frame->at; > + entries = frame->entries; > + restart = 1; > + } > + if (add_level && levels == ext4_dir_htree_level(sb)) { > + ext4_warning(sb, "Directory (ino: %lu) index full, " > + "reach max htree level :%d", > + dir->i_ino, levels); > + if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) { > + ext4_warning(sb, "Large directory feature is " > + "not enabled on this " > + "filesystem"); > + } > err = -ENOSPC; > goto cleanup; > } > + icount = dx_get_count(entries); > bh2 = ext4_append(handle, dir, &newblock); > if (IS_ERR(bh2)) { > err = PTR_ERR(bh2); > @@ -2245,7 +2282,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, > err = ext4_journal_get_write_access(handle, frame->bh); > if (err) > goto journal_error; > - if (levels) { > + if (!add_level) { > unsigned icount1 = icount/2, icount2 = icount - icount1; > unsigned hash2 = dx_get_hash(entries + icount1); > dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", > @@ -2253,7 +2290,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, > > BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ > err = ext4_journal_get_write_access(handle, > - frames[0].bh); > + (frame - 1)->bh); > if (err) > goto journal_error; > > @@ -2269,17 +2306,23 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, > frame->entries = entries = entries2; > swap(frame->bh, bh2); > } > - dx_insert_block(frames + 0, hash2, newblock); > - dxtrace(dx_show_index("node", frames[1].entries)); > + dx_insert_block((frame - 1), hash2, newblock); > + dxtrace(dx_show_index("node", frame->entries)); > dxtrace(dx_show_index("node", > ((struct dx_node *) bh2->b_data)->entries)); > err = ext4_handle_dirty_dx_node(handle, dir, bh2); > if (err) > goto journal_error; > brelse (bh2); > + ext4_handle_dirty_metadata(handle, dir, > + (frame - 1)->bh); > + if (restart) { > + ext4_handle_dirty_metadata(handle, dir, > + frame->bh); > + goto cleanup; > + } > } else { > - dxtrace(printk(KERN_DEBUG > - "Creating second level index...\n")); > + struct dx_root *dxroot; > memcpy((char *) entries2, (char *) entries, > icount * sizeof(struct dx_entry)); > dx_set_limit(entries2, dx_node_limit(dir)); > @@ -2287,19 +2330,17 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, > /* Set up root */ > dx_set_count(entries, 1); > dx_set_block(entries + 0, newblock); > - ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; > - > - /* Add new access path frame */ > - frame = frames + 1; > - frame->at = at = at - entries + entries2; > - frame->entries = entries = entries2; > - frame->bh = bh2; > - err = ext4_journal_get_write_access(handle, > - frame->bh); > - if (err) > - goto journal_error; > + dxroot = (struct dx_root *)frames[0].bh->b_data; > + dxroot->info.indirect_levels += 1; > + dxtrace(printk(KERN_DEBUG > + "Creating %d level index...\n", > + info->indirect_levels)); > + ext4_handle_dirty_metadata(handle, dir, frame->bh); > + ext4_handle_dirty_metadata(handle, dir, bh2); > + brelse(bh2); > + restart = 1; > + goto cleanup; > } > - err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh); > if (err) { > ext4_std_error(inode->i_sb, err); > goto cleanup; > @@ -2318,6 +2359,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, > cleanup: > brelse(bh); > dx_release(frames); > + /* @restart is true means htree-path has been changed, we need to > + * repeat dx_probe() to find out valid htree-path > + */ > + if (restart && err == 0) > + goto again; > return err; > } > > @@ -2354,7 +2400,7 @@ int ext4_generic_delete_entry(handle_t *handle, > blocksize); > else > de->inode = 0; > - dir->i_version++; > + inode_inc_iversion(dir); > return 0; > } > i += ext4_rec_len_from_disk(de->rec_len, blocksize); > -- > 1.8.3.1 > Cheers, Andreas
Andreas, I not strongly against it patch, but my testing say - 3 level h-tree need a more than 20mil files in directory, and creation rate was dropped dramatically, a specially without parallel dir operations, Did we need do some other research changes with landing it patch as disk format will changed anyway by incompatible way ? > 17 марта 2017 г., в 0:44, Andreas Dilger <adilger@dilger.ca> написал(а): > > On Mar 16, 2017, at 3:51 AM, Artem Blagodarenko <artem.blagodarenko@gmail.com> wrote: >> >> From: Artem Blagodarenko <artem.blagodarenko@gmail.com> >> >> This INCOMPAT_LARGEDIR feature allows larger directories to be created >> in ldiskfs, both with directory sizes over 2GB and and a maximum htree >> depth of 3 instead of the current limit of 2. These features are needed >> in order to exceed the current limit of approximately 10M entries in a >> single directory. > > Artem, > many thanks for updating and submitting this, and creating the e2fsck patches. > > Ted, > can you please add the original author for the largedir ext4 patch when > landing this patch: Liang Zhen <liang.zhen@intel.com> > > and these tags which contain more background information on this feature: > > Lustre-change: https://review.hpdd.intel.com/375 > Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-50 > >> Signed-off-by: Yang Sheng <yang.sheng@intel.com> >> Signed-off-by: Artem Blagodarenko <artem.blagodarenko@seagate.com> > > Reviewed-by: Andreas Dilger <andreas.dilger@intel.com> > >> --- >> fs/ext4/ext4.h | 23 ++++++++--- >> fs/ext4/inode.c | 4 +- >> fs/ext4/namei.c | 118 +++++++++++++++++++++++++++++++++++++++----------------- >> 3 files changed, 102 insertions(+), 43 deletions(-) >> >> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h >> index 01d52b9..0bbbd9b 100644 >> --- a/fs/ext4/ext4.h >> +++ b/fs/ext4/ext4.h >> @@ -1799,7 +1799,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) >> EXT4_FEATURE_INCOMPAT_MMP | \ >> EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ >> EXT4_FEATURE_INCOMPAT_ENCRYPT | \ >> - EXT4_FEATURE_INCOMPAT_CSUM_SEED) >> + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ >> + EXT4_FEATURE_INCOMPAT_LARGEDIR) >> #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ >> EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ >> EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ >> @@ -2125,6 +2126,16 @@ struct dir_private_info { >> */ >> #define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) >> >> +/* htree levels for ext4 */ >> +#define EXT4_HTREE_LEVEL_COMPAT 2 >> +#define EXT4_HTREE_LEVEL 3 >> + >> +static inline int ext4_dir_htree_level(struct super_block *sb) >> +{ >> + return ext4_has_feature_largedir(sb) ? >> + EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; >> +} >> + >> /* >> * Timeout and state flag for lazy initialization inode thread. >> */ >> @@ -2758,13 +2769,15 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, >> es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); >> } >> >> -static inline loff_t ext4_isize(struct ext4_inode *raw_inode) >> +static inline loff_t ext4_isize(struct super_block *sb, >> + struct ext4_inode *raw_inode) >> { >> - if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) >> + if (ext4_has_feature_largedir(sb) || >> + S_ISREG(le16_to_cpu(raw_inode->i_mode))) >> return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | >> le32_to_cpu(raw_inode->i_size_lo); >> - else >> - return (loff_t) le32_to_cpu(raw_inode->i_size_lo); >> + >> + return (loff_t) le32_to_cpu(raw_inode->i_size_lo); >> } >> >> static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) >> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c >> index f622d4a..5787f3d 100644 >> --- a/fs/ext4/inode.c >> +++ b/fs/ext4/inode.c >> @@ -4682,7 +4682,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) >> if (ext4_has_feature_64bit(sb)) >> ei->i_file_acl |= >> ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; >> - inode->i_size = ext4_isize(raw_inode); >> + inode->i_size = ext4_isize(sb, raw_inode); >> if ((size = i_size_read(inode)) < 0) { >> EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size); >> ret = -EFSCORRUPTED; >> @@ -5008,7 +5008,7 @@ static int ext4_do_update_inode(handle_t *handle, >> raw_inode->i_file_acl_high = >> cpu_to_le16(ei->i_file_acl >> 32); >> raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); >> - if (ei->i_disksize != ext4_isize(raw_inode)) { >> + if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) { >> ext4_isize_set(raw_inode, ei->i_disksize); >> need_datasync = 1; >> } >> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c >> index 6ad612c..3298fe3 100644 >> --- a/fs/ext4/namei.c >> +++ b/fs/ext4/namei.c >> @@ -513,7 +513,7 @@ static inline int ext4_handle_dirty_dx_node(handle_t *handle, >> >> static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) >> { >> - return le32_to_cpu(entry->block) & 0x00ffffff; >> + return le32_to_cpu(entry->block) & 0x0fffffff; >> } > > It wouldn't be terrible to add a comment to this function explaining why > the block numbers are masked off, which is to allow the high bits of the > logical block number to hold a "fullness" fraction so that the kernel > could start shrinking directories if many files are deleted. That said, > this isn't a shortcoming of this patch, but a suggestion if the patch is > being resubmitted for some other reason. > >> static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) >> @@ -739,6 +739,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, >> struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); >> u32 hash; >> >> + memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0])); >> frame->bh = ext4_read_dirblock(dir, 0, INDEX); >> if (IS_ERR(frame->bh)) >> return (struct dx_frame *) frame->bh; >> @@ -768,9 +769,15 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, >> } >> >> indirect = root->info.indirect_levels; >> - if (indirect > 1) { >> - ext4_warning_inode(dir, "Unimplemented hash depth: %#06x", >> - root->info.indirect_levels); >> + if (indirect >= ext4_dir_htree_level(dir->i_sb)) { >> + ext4_warning(dir->i_sb, >> + "Directory (ino: %lu) htree depth %#06x exceed" >> + "supported value", dir->i_ino, >> + ext4_dir_htree_level(dir->i_sb)); >> + if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) { >> + ext4_warning(dir->i_sb, "Enable large directory " >> + "feature to access it"); >> + } >> goto fail; >> } >> >> @@ -859,12 +866,19 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, >> >> static void dx_release(struct dx_frame *frames) >> { >> + struct dx_root_info *info; >> + int i; >> + >> if (frames[0].bh == NULL) >> return; >> >> - if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels) >> - brelse(frames[1].bh); >> - brelse(frames[0].bh); >> + info = &((struct dx_root *)frames[0].bh->b_data)->info; >> + for (i = 0; i <= info->indirect_levels; i++) { >> + if (frames[i].bh == NULL) >> + break; >> + brelse(frames[i].bh); >> + frames[i].bh = NULL; >> + } >> } >> >> /* >> @@ -1050,7 +1064,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, >> { >> struct dx_hash_info hinfo; >> struct ext4_dir_entry_2 *de; >> - struct dx_frame frames[2], *frame; >> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; >> struct inode *dir; >> ext4_lblk_t block; >> int count = 0; >> @@ -1517,7 +1531,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, >> struct ext4_dir_entry_2 **res_dir) >> { >> struct super_block * sb = dir->i_sb; >> - struct dx_frame frames[2], *frame; >> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; >> const struct qstr *d_name = fname->usr_fname; >> struct buffer_head *bh; >> ext4_lblk_t block; >> @@ -1947,7 +1961,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, >> */ >> dir->i_mtime = dir->i_ctime = current_time(dir); >> ext4_update_dx_flag(dir); >> - dir->i_version++; >> + inode_inc_iversion(dir); >> ext4_mark_inode_dirty(handle, dir); >> BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); >> err = ext4_handle_dirty_dirent_node(handle, dir, bh); >> @@ -1966,7 +1980,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, >> { >> struct buffer_head *bh2; >> struct dx_root *root; >> - struct dx_frame frames[2], *frame; >> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; >> struct dx_entry *entries; >> struct ext4_dir_entry_2 *de, *de2; >> struct ext4_dir_entry_tail *t; >> @@ -2185,13 +2199,16 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, >> static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, >> struct inode *dir, struct inode *inode) >> { >> - struct dx_frame frames[2], *frame; >> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; >> struct dx_entry *entries, *at; >> struct buffer_head *bh; >> struct super_block *sb = dir->i_sb; >> struct ext4_dir_entry_2 *de; >> + int restart; >> int err; >> >> +again: >> + restart = 0; >> frame = dx_probe(fname, dir, NULL, frames); >> if (IS_ERR(frame)) >> return PTR_ERR(frame); >> @@ -2213,24 +2230,44 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, >> if (err != -ENOSPC) >> goto cleanup; >> >> + err = 0; >> /* Block full, should compress but for now just split */ >> dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", >> dx_get_count(entries), dx_get_limit(entries))); >> /* Need to split index? */ >> if (dx_get_count(entries) == dx_get_limit(entries)) { >> ext4_lblk_t newblock; >> - unsigned icount = dx_get_count(entries); >> - int levels = frame - frames; >> + int levels = frame - frames + 1; >> + unsigned int icount; >> + int add_level = 1; >> struct dx_entry *entries2; >> struct dx_node *node2; >> struct buffer_head *bh2; >> >> - if (levels && (dx_get_count(frames->entries) == >> - dx_get_limit(frames->entries))) { >> - ext4_warning_inode(dir, "Directory index full!"); >> + while (frame > frames) { >> + if (dx_get_count((frame - 1)->entries) < >> + dx_get_limit((frame - 1)->entries)) { >> + add_level = 0; >> + break; >> + } >> + frame--; /* split higher index block */ >> + at = frame->at; >> + entries = frame->entries; >> + restart = 1; >> + } >> + if (add_level && levels == ext4_dir_htree_level(sb)) { >> + ext4_warning(sb, "Directory (ino: %lu) index full, " >> + "reach max htree level :%d", >> + dir->i_ino, levels); >> + if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) { >> + ext4_warning(sb, "Large directory feature is " >> + "not enabled on this " >> + "filesystem"); >> + } >> err = -ENOSPC; >> goto cleanup; >> } >> + icount = dx_get_count(entries); >> bh2 = ext4_append(handle, dir, &newblock); >> if (IS_ERR(bh2)) { >> err = PTR_ERR(bh2); >> @@ -2245,7 +2282,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, >> err = ext4_journal_get_write_access(handle, frame->bh); >> if (err) >> goto journal_error; >> - if (levels) { >> + if (!add_level) { >> unsigned icount1 = icount/2, icount2 = icount - icount1; >> unsigned hash2 = dx_get_hash(entries + icount1); >> dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", >> @@ -2253,7 +2290,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, >> >> BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ >> err = ext4_journal_get_write_access(handle, >> - frames[0].bh); >> + (frame - 1)->bh); >> if (err) >> goto journal_error; >> >> @@ -2269,17 +2306,23 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, >> frame->entries = entries = entries2; >> swap(frame->bh, bh2); >> } >> - dx_insert_block(frames + 0, hash2, newblock); >> - dxtrace(dx_show_index("node", frames[1].entries)); >> + dx_insert_block((frame - 1), hash2, newblock); >> + dxtrace(dx_show_index("node", frame->entries)); >> dxtrace(dx_show_index("node", >> ((struct dx_node *) bh2->b_data)->entries)); >> err = ext4_handle_dirty_dx_node(handle, dir, bh2); >> if (err) >> goto journal_error; >> brelse (bh2); >> + ext4_handle_dirty_metadata(handle, dir, >> + (frame - 1)->bh); >> + if (restart) { >> + ext4_handle_dirty_metadata(handle, dir, >> + frame->bh); >> + goto cleanup; >> + } >> } else { >> - dxtrace(printk(KERN_DEBUG >> - "Creating second level index...\n")); >> + struct dx_root *dxroot; >> memcpy((char *) entries2, (char *) entries, >> icount * sizeof(struct dx_entry)); >> dx_set_limit(entries2, dx_node_limit(dir)); >> @@ -2287,19 +2330,17 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, >> /* Set up root */ >> dx_set_count(entries, 1); >> dx_set_block(entries + 0, newblock); >> - ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; >> - >> - /* Add new access path frame */ >> - frame = frames + 1; >> - frame->at = at = at - entries + entries2; >> - frame->entries = entries = entries2; >> - frame->bh = bh2; >> - err = ext4_journal_get_write_access(handle, >> - frame->bh); >> - if (err) >> - goto journal_error; >> + dxroot = (struct dx_root *)frames[0].bh->b_data; >> + dxroot->info.indirect_levels += 1; >> + dxtrace(printk(KERN_DEBUG >> + "Creating %d level index...\n", >> + info->indirect_levels)); >> + ext4_handle_dirty_metadata(handle, dir, frame->bh); >> + ext4_handle_dirty_metadata(handle, dir, bh2); >> + brelse(bh2); >> + restart = 1; >> + goto cleanup; >> } >> - err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh); >> if (err) { >> ext4_std_error(inode->i_sb, err); >> goto cleanup; >> @@ -2318,6 +2359,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, >> cleanup: >> brelse(bh); >> dx_release(frames); >> + /* @restart is true means htree-path has been changed, we need to >> + * repeat dx_probe() to find out valid htree-path >> + */ >> + if (restart && err == 0) >> + goto again; >> return err; >> } >> >> @@ -2354,7 +2400,7 @@ int ext4_generic_delete_entry(handle_t *handle, >> blocksize); >> else >> de->inode = 0; >> - dir->i_version++; >> + inode_inc_iversion(dir); >> return 0; >> } >> i += ext4_rec_len_from_disk(de->rec_len, blocksize); >> -- >> 1.8.3.1 >> > > > Cheers, Andreas > > > > >
On Mar 17, 2017, at 12:15 AM, Alexey Lyashkov <alexey.lyashkov@gmail.com> wrote: > > Andreas, > > I not strongly against it patch, but my testing say - 3 level h-tree need a more than 20mil files in directory, and creation rate was dropped dramatically, > a specially without parallel dir operations, Did we need do some other research changes with landing it patch as disk format will changed anyway by incompatible way ? Hi Alexey, I'm not against further improvements to large directory handling (e.g. directory shrinking, implement pdirops through VFS, etc.) for ext4. That said, anything that changes the disk format should use a different feature flag than EXT4_FEATURE_INCOMPAT_LARGEDIR, since we have been using this flag in the Lustre code for several years already. I think for very large directories that directory shrinking can be quite useful, and could actually be implemented without a format change since htree always masks off the top byte ("fullness ratio") of the logical block number for this reason, and removing leaf blocks does not change the other parts of the htree on-disk format. This could still be done (in a slightly less efficient manner) even without "fullness ratio" (i.e. if fullness == 0, unset from older versions of ext4/e2fsck). This would also help performance noticeably if a directory has many files deleted, as the htree index will shrink, and operations like readdir() will not have to skip empty blocks. IMHO, I think the 3-level htree maximum limits are as far as we need to go for ext4. With 4KB blocks this gives us a maximum directory size of about 5B entries, which is more files than can fit in the filesystem, Even for 1KB blocks the maximum directory size is about 8M entries, which is fine since 1KB blocksize is mostly only used for small filesystems, yet is still as good as what a 4KB filesystem can do today. Cheers, Andreas >> 17 марта 2017 г., в 0:44, Andreas Dilger <adilger@dilger.ca> написал(а): >> >> On Mar 16, 2017, at 3:51 AM, Artem Blagodarenko <artem.blagodarenko@gmail.com> wrote: >>> >>> From: Artem Blagodarenko <artem.blagodarenko@gmail.com> >>> >>> This INCOMPAT_LARGEDIR feature allows larger directories to be created >>> in ldiskfs, both with directory sizes over 2GB and and a maximum htree >>> depth of 3 instead of the current limit of 2. These features are needed >>> in order to exceed the current limit of approximately 10M entries in a >>> single directory. >> >> Artem, >> many thanks for updating and submitting this, and creating the e2fsck patches. >> >> Ted, >> can you please add the original author for the largedir ext4 patch when >> landing this patch: Liang Zhen <liang.zhen@intel.com> >> >> and these tags which contain more background information on this feature: >> >> Lustre-change: https://review.hpdd.intel.com/375 >> Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-50 >> >>> Signed-off-by: Yang Sheng <yang.sheng@intel.com> >>> Signed-off-by: Artem Blagodarenko <artem.blagodarenko@seagate.com> >> >> Reviewed-by: Andreas Dilger <andreas.dilger@intel.com> >> >>> --- >>> fs/ext4/ext4.h | 23 ++++++++--- >>> fs/ext4/inode.c | 4 +- >>> fs/ext4/namei.c | 118 +++++++++++++++++++++++++++++++++++++++----------------- >>> 3 files changed, 102 insertions(+), 43 deletions(-) >>> >>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h >>> index 01d52b9..0bbbd9b 100644 >>> --- a/fs/ext4/ext4.h >>> +++ b/fs/ext4/ext4.h >>> @@ -1799,7 +1799,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) >>> EXT4_FEATURE_INCOMPAT_MMP | \ >>> EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ >>> EXT4_FEATURE_INCOMPAT_ENCRYPT | \ >>> - EXT4_FEATURE_INCOMPAT_CSUM_SEED) >>> + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ >>> + EXT4_FEATURE_INCOMPAT_LARGEDIR) >>> #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ >>> EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ >>> EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ >>> @@ -2125,6 +2126,16 @@ struct dir_private_info { >>> */ >>> #define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) >>> >>> +/* htree levels for ext4 */ >>> +#define EXT4_HTREE_LEVEL_COMPAT 2 >>> +#define EXT4_HTREE_LEVEL 3 >>> + >>> +static inline int ext4_dir_htree_level(struct super_block *sb) >>> +{ >>> + return ext4_has_feature_largedir(sb) ? >>> + EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; >>> +} >>> + >>> /* >>> * Timeout and state flag for lazy initialization inode thread. >>> */ >>> @@ -2758,13 +2769,15 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, >>> es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); >>> } >>> >>> -static inline loff_t ext4_isize(struct ext4_inode *raw_inode) >>> +static inline loff_t ext4_isize(struct super_block *sb, >>> + struct ext4_inode *raw_inode) >>> { >>> - if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) >>> + if (ext4_has_feature_largedir(sb) || >>> + S_ISREG(le16_to_cpu(raw_inode->i_mode))) >>> return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | >>> le32_to_cpu(raw_inode->i_size_lo); >>> - else >>> - return (loff_t) le32_to_cpu(raw_inode->i_size_lo); >>> + >>> + return (loff_t) le32_to_cpu(raw_inode->i_size_lo); >>> } >>> >>> static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) >>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c >>> index f622d4a..5787f3d 100644 >>> --- a/fs/ext4/inode.c >>> +++ b/fs/ext4/inode.c >>> @@ -4682,7 +4682,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) >>> if (ext4_has_feature_64bit(sb)) >>> ei->i_file_acl |= >>> ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; >>> - inode->i_size = ext4_isize(raw_inode); >>> + inode->i_size = ext4_isize(sb, raw_inode); >>> if ((size = i_size_read(inode)) < 0) { >>> EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size); >>> ret = -EFSCORRUPTED; >>> @@ -5008,7 +5008,7 @@ static int ext4_do_update_inode(handle_t *handle, >>> raw_inode->i_file_acl_high = >>> cpu_to_le16(ei->i_file_acl >> 32); >>> raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); >>> - if (ei->i_disksize != ext4_isize(raw_inode)) { >>> + if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) { >>> ext4_isize_set(raw_inode, ei->i_disksize); >>> need_datasync = 1; >>> } >>> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c >>> index 6ad612c..3298fe3 100644 >>> --- a/fs/ext4/namei.c >>> +++ b/fs/ext4/namei.c >>> @@ -513,7 +513,7 @@ static inline int ext4_handle_dirty_dx_node(handle_t *handle, >>> >>> static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) >>> { >>> - return le32_to_cpu(entry->block) & 0x00ffffff; >>> + return le32_to_cpu(entry->block) & 0x0fffffff; >>> } >> >> It wouldn't be terrible to add a comment to this function explaining why >> the block numbers are masked off, which is to allow the high bits of the >> logical block number to hold a "fullness" fraction so that the kernel >> could start shrinking directories if many files are deleted. That said, >> this isn't a shortcoming of this patch, but a suggestion if the patch is >> being resubmitted for some other reason. >> >>> static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) >>> @@ -739,6 +739,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, >>> struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); >>> u32 hash; >>> >>> + memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0])); >>> frame->bh = ext4_read_dirblock(dir, 0, INDEX); >>> if (IS_ERR(frame->bh)) >>> return (struct dx_frame *) frame->bh; >>> @@ -768,9 +769,15 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, >>> } >>> >>> indirect = root->info.indirect_levels; >>> - if (indirect > 1) { >>> - ext4_warning_inode(dir, "Unimplemented hash depth: %#06x", >>> - root->info.indirect_levels); >>> + if (indirect >= ext4_dir_htree_level(dir->i_sb)) { >>> + ext4_warning(dir->i_sb, >>> + "Directory (ino: %lu) htree depth %#06x exceed" >>> + "supported value", dir->i_ino, >>> + ext4_dir_htree_level(dir->i_sb)); >>> + if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) { >>> + ext4_warning(dir->i_sb, "Enable large directory " >>> + "feature to access it"); >>> + } >>> goto fail; >>> } >>> >>> @@ -859,12 +866,19 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, >>> >>> static void dx_release(struct dx_frame *frames) >>> { >>> + struct dx_root_info *info; >>> + int i; >>> + >>> if (frames[0].bh == NULL) >>> return; >>> >>> - if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels) >>> - brelse(frames[1].bh); >>> - brelse(frames[0].bh); >>> + info = &((struct dx_root *)frames[0].bh->b_data)->info; >>> + for (i = 0; i <= info->indirect_levels; i++) { >>> + if (frames[i].bh == NULL) >>> + break; >>> + brelse(frames[i].bh); >>> + frames[i].bh = NULL; >>> + } >>> } >>> >>> /* >>> @@ -1050,7 +1064,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, >>> { >>> struct dx_hash_info hinfo; >>> struct ext4_dir_entry_2 *de; >>> - struct dx_frame frames[2], *frame; >>> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; >>> struct inode *dir; >>> ext4_lblk_t block; >>> int count = 0; >>> @@ -1517,7 +1531,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, >>> struct ext4_dir_entry_2 **res_dir) >>> { >>> struct super_block * sb = dir->i_sb; >>> - struct dx_frame frames[2], *frame; >>> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; >>> const struct qstr *d_name = fname->usr_fname; >>> struct buffer_head *bh; >>> ext4_lblk_t block; >>> @@ -1947,7 +1961,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, >>> */ >>> dir->i_mtime = dir->i_ctime = current_time(dir); >>> ext4_update_dx_flag(dir); >>> - dir->i_version++; >>> + inode_inc_iversion(dir); >>> ext4_mark_inode_dirty(handle, dir); >>> BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); >>> err = ext4_handle_dirty_dirent_node(handle, dir, bh); >>> @@ -1966,7 +1980,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, >>> { >>> struct buffer_head *bh2; >>> struct dx_root *root; >>> - struct dx_frame frames[2], *frame; >>> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; >>> struct dx_entry *entries; >>> struct ext4_dir_entry_2 *de, *de2; >>> struct ext4_dir_entry_tail *t; >>> @@ -2185,13 +2199,16 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, >>> static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, >>> struct inode *dir, struct inode *inode) >>> { >>> - struct dx_frame frames[2], *frame; >>> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; >>> struct dx_entry *entries, *at; >>> struct buffer_head *bh; >>> struct super_block *sb = dir->i_sb; >>> struct ext4_dir_entry_2 *de; >>> + int restart; >>> int err; >>> >>> +again: >>> + restart = 0; >>> frame = dx_probe(fname, dir, NULL, frames); >>> if (IS_ERR(frame)) >>> return PTR_ERR(frame); >>> @@ -2213,24 +2230,44 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, >>> if (err != -ENOSPC) >>> goto cleanup; >>> >>> + err = 0; >>> /* Block full, should compress but for now just split */ >>> dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", >>> dx_get_count(entries), dx_get_limit(entries))); >>> /* Need to split index? */ >>> if (dx_get_count(entries) == dx_get_limit(entries)) { >>> ext4_lblk_t newblock; >>> - unsigned icount = dx_get_count(entries); >>> - int levels = frame - frames; >>> + int levels = frame - frames + 1; >>> + unsigned int icount; >>> + int add_level = 1; >>> struct dx_entry *entries2; >>> struct dx_node *node2; >>> struct buffer_head *bh2; >>> >>> - if (levels && (dx_get_count(frames->entries) == >>> - dx_get_limit(frames->entries))) { >>> - ext4_warning_inode(dir, "Directory index full!"); >>> + while (frame > frames) { >>> + if (dx_get_count((frame - 1)->entries) < >>> + dx_get_limit((frame - 1)->entries)) { >>> + add_level = 0; >>> + break; >>> + } >>> + frame--; /* split higher index block */ >>> + at = frame->at; >>> + entries = frame->entries; >>> + restart = 1; >>> + } >>> + if (add_level && levels == ext4_dir_htree_level(sb)) { >>> + ext4_warning(sb, "Directory (ino: %lu) index full, " >>> + "reach max htree level :%d", >>> + dir->i_ino, levels); >>> + if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) { >>> + ext4_warning(sb, "Large directory feature is " >>> + "not enabled on this " >>> + "filesystem"); >>> + } >>> err = -ENOSPC; >>> goto cleanup; >>> } >>> + icount = dx_get_count(entries); >>> bh2 = ext4_append(handle, dir, &newblock); >>> if (IS_ERR(bh2)) { >>> err = PTR_ERR(bh2); >>> @@ -2245,7 +2282,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, >>> err = ext4_journal_get_write_access(handle, frame->bh); >>> if (err) >>> goto journal_error; >>> - if (levels) { >>> + if (!add_level) { >>> unsigned icount1 = icount/2, icount2 = icount - icount1; >>> unsigned hash2 = dx_get_hash(entries + icount1); >>> dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", >>> @@ -2253,7 +2290,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, >>> >>> BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ >>> err = ext4_journal_get_write_access(handle, >>> - frames[0].bh); >>> + (frame - 1)->bh); >>> if (err) >>> goto journal_error; >>> >>> @@ -2269,17 +2306,23 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, >>> frame->entries = entries = entries2; >>> swap(frame->bh, bh2); >>> } >>> - dx_insert_block(frames + 0, hash2, newblock); >>> - dxtrace(dx_show_index("node", frames[1].entries)); >>> + dx_insert_block((frame - 1), hash2, newblock); >>> + dxtrace(dx_show_index("node", frame->entries)); >>> dxtrace(dx_show_index("node", >>> ((struct dx_node *) bh2->b_data)->entries)); >>> err = ext4_handle_dirty_dx_node(handle, dir, bh2); >>> if (err) >>> goto journal_error; >>> brelse (bh2); >>> + ext4_handle_dirty_metadata(handle, dir, >>> + (frame - 1)->bh); >>> + if (restart) { >>> + ext4_handle_dirty_metadata(handle, dir, >>> + frame->bh); >>> + goto cleanup; >>> + } >>> } else { >>> - dxtrace(printk(KERN_DEBUG >>> - "Creating second level index...\n")); >>> + struct dx_root *dxroot; >>> memcpy((char *) entries2, (char *) entries, >>> icount * sizeof(struct dx_entry)); >>> dx_set_limit(entries2, dx_node_limit(dir)); >>> @@ -2287,19 +2330,17 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, >>> /* Set up root */ >>> dx_set_count(entries, 1); >>> dx_set_block(entries + 0, newblock); >>> - ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; >>> - >>> - /* Add new access path frame */ >>> - frame = frames + 1; >>> - frame->at = at = at - entries + entries2; >>> - frame->entries = entries = entries2; >>> - frame->bh = bh2; >>> - err = ext4_journal_get_write_access(handle, >>> - frame->bh); >>> - if (err) >>> - goto journal_error; >>> + dxroot = (struct dx_root *)frames[0].bh->b_data; >>> + dxroot->info.indirect_levels += 1; >>> + dxtrace(printk(KERN_DEBUG >>> + "Creating %d level index...\n", >>> + info->indirect_levels)); >>> + ext4_handle_dirty_metadata(handle, dir, frame->bh); >>> + ext4_handle_dirty_metadata(handle, dir, bh2); >>> + brelse(bh2); >>> + restart = 1; >>> + goto cleanup; >>> } >>> - err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh); >>> if (err) { >>> ext4_std_error(inode->i_sb, err); >>> goto cleanup; >>> @@ -2318,6 +2359,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, >>> cleanup: >>> brelse(bh); >>> dx_release(frames); >>> + /* @restart is true means htree-path has been changed, we need to >>> + * repeat dx_probe() to find out valid htree-path >>> + */ >>> + if (restart && err == 0) >>> + goto again; >>> return err; >>> } >>> >>> @@ -2354,7 +2400,7 @@ int ext4_generic_delete_entry(handle_t *handle, >>> blocksize); >>> else >>> de->inode = 0; >>> - dir->i_version++; >>> + inode_inc_iversion(dir); >>> return 0; >>> } >>> i += ext4_rec_len_from_disk(de->rec_len, blocksize); >>> -- >>> 1.8.3.1 >>> >> >> >> Cheers, Andreas >> >> >> >> >> > Cheers, Andreas
Andreas, it not about a feature flag. It’s about a situation in whole. Yes, we may increase a directory size, but it open a number a large problems. 1) readdir. It tries to read all entries in memory before send to the user. currently it may eats 20*10^6 * 256 so several gigs, so increasing it size may produce a problems for a system. 2) inode allocation. Current code tries to allocate an inode as near as possible to the directory inode, but one GD may hold 32k entries only, so increase a directory size will use up 1k GD for now and more than it, after it. It increase a seek time with file allocation. It was i mean when say - «dramatically decrease a file creation rate». 3) current limit with 4G inodes - currently 32-128 directories may eats a full inode number space. From it perspective large dir don’t need to be used. Other improvements you point already. > 17 марта 2017 г., в 23:51, Andreas Dilger <adilger@dilger.ca> написал(а): > > On Mar 17, 2017, at 12:15 AM, Alexey Lyashkov <alexey.lyashkov@gmail.com> wrote: >> >> Andreas, >> >> I not strongly against it patch, but my testing say - 3 level h-tree need a more than 20mil files in directory, and creation rate was dropped dramatically, >> a specially without parallel dir operations, Did we need do some other research changes with landing it patch as disk format will changed anyway by incompatible way ? > > Hi Alexey, > I'm not against further improvements to large directory handling (e.g. > directory shrinking, implement pdirops through VFS, etc.) for ext4. > That said, anything that changes the disk format should use a different > feature flag than EXT4_FEATURE_INCOMPAT_LARGEDIR, since we have been > using this flag in the Lustre code for several years already. > > I think for very large directories that directory shrinking can be quite > useful, and could actually be implemented without a format change since > htree always masks off the top byte ("fullness ratio") of the logical block > number for this reason, and removing leaf blocks does not change the other > parts of the htree on-disk format. This could still be done (in a slightly > less efficient manner) even without "fullness ratio" (i.e. if fullness == 0, > unset from older versions of ext4/e2fsck). This would also help performance > noticeably if a directory has many files deleted, as the htree index will > shrink, and operations like readdir() will not have to skip empty blocks. > > IMHO, I think the 3-level htree maximum limits are as far as we need to > go for ext4. With 4KB blocks this gives us a maximum directory size > of about 5B entries, which is more files than can fit in the filesystem, > Even for 1KB blocks the maximum directory size is about 8M entries, which > is fine since 1KB blocksize is mostly only used for small filesystems, yet > is still as good as what a 4KB filesystem can do today. > > Cheers, Andreas > >>> 17 марта 2017 г., в 0:44, Andreas Dilger <adilger@dilger.ca> написал(а): >>> >>> On Mar 16, 2017, at 3:51 AM, Artem Blagodarenko <artem.blagodarenko@gmail.com> wrote: >>>> >>>> From: Artem Blagodarenko <artem.blagodarenko@gmail.com> >>>> >>>> This INCOMPAT_LARGEDIR feature allows larger directories to be created >>>> in ldiskfs, both with directory sizes over 2GB and and a maximum htree >>>> depth of 3 instead of the current limit of 2. These features are needed >>>> in order to exceed the current limit of approximately 10M entries in a >>>> single directory. >>> >>> Artem, >>> many thanks for updating and submitting this, and creating the e2fsck patches. >>> >>> Ted, >>> can you please add the original author for the largedir ext4 patch when >>> landing this patch: Liang Zhen <liang.zhen@intel.com> >>> >>> and these tags which contain more background information on this feature: >>> >>> Lustre-change: https://review.hpdd.intel.com/375 >>> Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-50 >>> >>>> Signed-off-by: Yang Sheng <yang.sheng@intel.com> >>>> Signed-off-by: Artem Blagodarenko <artem.blagodarenko@seagate.com> >>> >>> Reviewed-by: Andreas Dilger <andreas.dilger@intel.com> >>> >>>> --- >>>> fs/ext4/ext4.h | 23 ++++++++--- >>>> fs/ext4/inode.c | 4 +- >>>> fs/ext4/namei.c | 118 +++++++++++++++++++++++++++++++++++++++----------------- >>>> 3 files changed, 102 insertions(+), 43 deletions(-) >>>> >>>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h >>>> index 01d52b9..0bbbd9b 100644 >>>> --- a/fs/ext4/ext4.h >>>> +++ b/fs/ext4/ext4.h >>>> @@ -1799,7 +1799,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) >>>> EXT4_FEATURE_INCOMPAT_MMP | \ >>>> EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ >>>> EXT4_FEATURE_INCOMPAT_ENCRYPT | \ >>>> - EXT4_FEATURE_INCOMPAT_CSUM_SEED) >>>> + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ >>>> + EXT4_FEATURE_INCOMPAT_LARGEDIR) >>>> #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ >>>> EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ >>>> EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ >>>> @@ -2125,6 +2126,16 @@ struct dir_private_info { >>>> */ >>>> #define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) >>>> >>>> +/* htree levels for ext4 */ >>>> +#define EXT4_HTREE_LEVEL_COMPAT 2 >>>> +#define EXT4_HTREE_LEVEL 3 >>>> + >>>> +static inline int ext4_dir_htree_level(struct super_block *sb) >>>> +{ >>>> + return ext4_has_feature_largedir(sb) ? >>>> + EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; >>>> +} >>>> + >>>> /* >>>> * Timeout and state flag for lazy initialization inode thread. >>>> */ >>>> @@ -2758,13 +2769,15 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, >>>> es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); >>>> } >>>> >>>> -static inline loff_t ext4_isize(struct ext4_inode *raw_inode) >>>> +static inline loff_t ext4_isize(struct super_block *sb, >>>> + struct ext4_inode *raw_inode) >>>> { >>>> - if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) >>>> + if (ext4_has_feature_largedir(sb) || >>>> + S_ISREG(le16_to_cpu(raw_inode->i_mode))) >>>> return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | >>>> le32_to_cpu(raw_inode->i_size_lo); >>>> - else >>>> - return (loff_t) le32_to_cpu(raw_inode->i_size_lo); >>>> + >>>> + return (loff_t) le32_to_cpu(raw_inode->i_size_lo); >>>> } >>>> >>>> static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) >>>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c >>>> index f622d4a..5787f3d 100644 >>>> --- a/fs/ext4/inode.c >>>> +++ b/fs/ext4/inode.c >>>> @@ -4682,7 +4682,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) >>>> if (ext4_has_feature_64bit(sb)) >>>> ei->i_file_acl |= >>>> ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; >>>> - inode->i_size = ext4_isize(raw_inode); >>>> + inode->i_size = ext4_isize(sb, raw_inode); >>>> if ((size = i_size_read(inode)) < 0) { >>>> EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size); >>>> ret = -EFSCORRUPTED; >>>> @@ -5008,7 +5008,7 @@ static int ext4_do_update_inode(handle_t *handle, >>>> raw_inode->i_file_acl_high = >>>> cpu_to_le16(ei->i_file_acl >> 32); >>>> raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); >>>> - if (ei->i_disksize != ext4_isize(raw_inode)) { >>>> + if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) { >>>> ext4_isize_set(raw_inode, ei->i_disksize); >>>> need_datasync = 1; >>>> } >>>> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c >>>> index 6ad612c..3298fe3 100644 >>>> --- a/fs/ext4/namei.c >>>> +++ b/fs/ext4/namei.c >>>> @@ -513,7 +513,7 @@ static inline int ext4_handle_dirty_dx_node(handle_t *handle, >>>> >>>> static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) >>>> { >>>> - return le32_to_cpu(entry->block) & 0x00ffffff; >>>> + return le32_to_cpu(entry->block) & 0x0fffffff; >>>> } >>> >>> It wouldn't be terrible to add a comment to this function explaining why >>> the block numbers are masked off, which is to allow the high bits of the >>> logical block number to hold a "fullness" fraction so that the kernel >>> could start shrinking directories if many files are deleted. That said, >>> this isn't a shortcoming of this patch, but a suggestion if the patch is >>> being resubmitted for some other reason. >>> >>>> static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) >>>> @@ -739,6 +739,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, >>>> struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); >>>> u32 hash; >>>> >>>> + memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0])); >>>> frame->bh = ext4_read_dirblock(dir, 0, INDEX); >>>> if (IS_ERR(frame->bh)) >>>> return (struct dx_frame *) frame->bh; >>>> @@ -768,9 +769,15 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, >>>> } >>>> >>>> indirect = root->info.indirect_levels; >>>> - if (indirect > 1) { >>>> - ext4_warning_inode(dir, "Unimplemented hash depth: %#06x", >>>> - root->info.indirect_levels); >>>> + if (indirect >= ext4_dir_htree_level(dir->i_sb)) { >>>> + ext4_warning(dir->i_sb, >>>> + "Directory (ino: %lu) htree depth %#06x exceed" >>>> + "supported value", dir->i_ino, >>>> + ext4_dir_htree_level(dir->i_sb)); >>>> + if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) { >>>> + ext4_warning(dir->i_sb, "Enable large directory " >>>> + "feature to access it"); >>>> + } >>>> goto fail; >>>> } >>>> >>>> @@ -859,12 +866,19 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, >>>> >>>> static void dx_release(struct dx_frame *frames) >>>> { >>>> + struct dx_root_info *info; >>>> + int i; >>>> + >>>> if (frames[0].bh == NULL) >>>> return; >>>> >>>> - if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels) >>>> - brelse(frames[1].bh); >>>> - brelse(frames[0].bh); >>>> + info = &((struct dx_root *)frames[0].bh->b_data)->info; >>>> + for (i = 0; i <= info->indirect_levels; i++) { >>>> + if (frames[i].bh == NULL) >>>> + break; >>>> + brelse(frames[i].bh); >>>> + frames[i].bh = NULL; >>>> + } >>>> } >>>> >>>> /* >>>> @@ -1050,7 +1064,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, >>>> { >>>> struct dx_hash_info hinfo; >>>> struct ext4_dir_entry_2 *de; >>>> - struct dx_frame frames[2], *frame; >>>> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; >>>> struct inode *dir; >>>> ext4_lblk_t block; >>>> int count = 0; >>>> @@ -1517,7 +1531,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, >>>> struct ext4_dir_entry_2 **res_dir) >>>> { >>>> struct super_block * sb = dir->i_sb; >>>> - struct dx_frame frames[2], *frame; >>>> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; >>>> const struct qstr *d_name = fname->usr_fname; >>>> struct buffer_head *bh; >>>> ext4_lblk_t block; >>>> @@ -1947,7 +1961,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, >>>> */ >>>> dir->i_mtime = dir->i_ctime = current_time(dir); >>>> ext4_update_dx_flag(dir); >>>> - dir->i_version++; >>>> + inode_inc_iversion(dir); >>>> ext4_mark_inode_dirty(handle, dir); >>>> BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); >>>> err = ext4_handle_dirty_dirent_node(handle, dir, bh); >>>> @@ -1966,7 +1980,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, >>>> { >>>> struct buffer_head *bh2; >>>> struct dx_root *root; >>>> - struct dx_frame frames[2], *frame; >>>> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; >>>> struct dx_entry *entries; >>>> struct ext4_dir_entry_2 *de, *de2; >>>> struct ext4_dir_entry_tail *t; >>>> @@ -2185,13 +2199,16 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, >>>> static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, >>>> struct inode *dir, struct inode *inode) >>>> { >>>> - struct dx_frame frames[2], *frame; >>>> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; >>>> struct dx_entry *entries, *at; >>>> struct buffer_head *bh; >>>> struct super_block *sb = dir->i_sb; >>>> struct ext4_dir_entry_2 *de; >>>> + int restart; >>>> int err; >>>> >>>> +again: >>>> + restart = 0; >>>> frame = dx_probe(fname, dir, NULL, frames); >>>> if (IS_ERR(frame)) >>>> return PTR_ERR(frame); >>>> @@ -2213,24 +2230,44 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, >>>> if (err != -ENOSPC) >>>> goto cleanup; >>>> >>>> + err = 0; >>>> /* Block full, should compress but for now just split */ >>>> dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", >>>> dx_get_count(entries), dx_get_limit(entries))); >>>> /* Need to split index? */ >>>> if (dx_get_count(entries) == dx_get_limit(entries)) { >>>> ext4_lblk_t newblock; >>>> - unsigned icount = dx_get_count(entries); >>>> - int levels = frame - frames; >>>> + int levels = frame - frames + 1; >>>> + unsigned int icount; >>>> + int add_level = 1; >>>> struct dx_entry *entries2; >>>> struct dx_node *node2; >>>> struct buffer_head *bh2; >>>> >>>> - if (levels && (dx_get_count(frames->entries) == >>>> - dx_get_limit(frames->entries))) { >>>> - ext4_warning_inode(dir, "Directory index full!"); >>>> + while (frame > frames) { >>>> + if (dx_get_count((frame - 1)->entries) < >>>> + dx_get_limit((frame - 1)->entries)) { >>>> + add_level = 0; >>>> + break; >>>> + } >>>> + frame--; /* split higher index block */ >>>> + at = frame->at; >>>> + entries = frame->entries; >>>> + restart = 1; >>>> + } >>>> + if (add_level && levels == ext4_dir_htree_level(sb)) { >>>> + ext4_warning(sb, "Directory (ino: %lu) index full, " >>>> + "reach max htree level :%d", >>>> + dir->i_ino, levels); >>>> + if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) { >>>> + ext4_warning(sb, "Large directory feature is " >>>> + "not enabled on this " >>>> + "filesystem"); >>>> + } >>>> err = -ENOSPC; >>>> goto cleanup; >>>> } >>>> + icount = dx_get_count(entries); >>>> bh2 = ext4_append(handle, dir, &newblock); >>>> if (IS_ERR(bh2)) { >>>> err = PTR_ERR(bh2); >>>> @@ -2245,7 +2282,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, >>>> err = ext4_journal_get_write_access(handle, frame->bh); >>>> if (err) >>>> goto journal_error; >>>> - if (levels) { >>>> + if (!add_level) { >>>> unsigned icount1 = icount/2, icount2 = icount - icount1; >>>> unsigned hash2 = dx_get_hash(entries + icount1); >>>> dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", >>>> @@ -2253,7 +2290,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, >>>> >>>> BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ >>>> err = ext4_journal_get_write_access(handle, >>>> - frames[0].bh); >>>> + (frame - 1)->bh); >>>> if (err) >>>> goto journal_error; >>>> >>>> @@ -2269,17 +2306,23 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, >>>> frame->entries = entries = entries2; >>>> swap(frame->bh, bh2); >>>> } >>>> - dx_insert_block(frames + 0, hash2, newblock); >>>> - dxtrace(dx_show_index("node", frames[1].entries)); >>>> + dx_insert_block((frame - 1), hash2, newblock); >>>> + dxtrace(dx_show_index("node", frame->entries)); >>>> dxtrace(dx_show_index("node", >>>> ((struct dx_node *) bh2->b_data)->entries)); >>>> err = ext4_handle_dirty_dx_node(handle, dir, bh2); >>>> if (err) >>>> goto journal_error; >>>> brelse (bh2); >>>> + ext4_handle_dirty_metadata(handle, dir, >>>> + (frame - 1)->bh); >>>> + if (restart) { >>>> + ext4_handle_dirty_metadata(handle, dir, >>>> + frame->bh); >>>> + goto cleanup; >>>> + } >>>> } else { >>>> - dxtrace(printk(KERN_DEBUG >>>> - "Creating second level index...\n")); >>>> + struct dx_root *dxroot; >>>> memcpy((char *) entries2, (char *) entries, >>>> icount * sizeof(struct dx_entry)); >>>> dx_set_limit(entries2, dx_node_limit(dir)); >>>> @@ -2287,19 +2330,17 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, >>>> /* Set up root */ >>>> dx_set_count(entries, 1); >>>> dx_set_block(entries + 0, newblock); >>>> - ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; >>>> - >>>> - /* Add new access path frame */ >>>> - frame = frames + 1; >>>> - frame->at = at = at - entries + entries2; >>>> - frame->entries = entries = entries2; >>>> - frame->bh = bh2; >>>> - err = ext4_journal_get_write_access(handle, >>>> - frame->bh); >>>> - if (err) >>>> - goto journal_error; >>>> + dxroot = (struct dx_root *)frames[0].bh->b_data; >>>> + dxroot->info.indirect_levels += 1; >>>> + dxtrace(printk(KERN_DEBUG >>>> + "Creating %d level index...\n", >>>> + info->indirect_levels)); >>>> + ext4_handle_dirty_metadata(handle, dir, frame->bh); >>>> + ext4_handle_dirty_metadata(handle, dir, bh2); >>>> + brelse(bh2); >>>> + restart = 1; >>>> + goto cleanup; >>>> } >>>> - err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh); >>>> if (err) { >>>> ext4_std_error(inode->i_sb, err); >>>> goto cleanup; >>>> @@ -2318,6 +2359,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, >>>> cleanup: >>>> brelse(bh); >>>> dx_release(frames); >>>> + /* @restart is true means htree-path has been changed, we need to >>>> + * repeat dx_probe() to find out valid htree-path >>>> + */ >>>> + if (restart && err == 0) >>>> + goto again; >>>> return err; >>>> } >>>> >>>> @@ -2354,7 +2400,7 @@ int ext4_generic_delete_entry(handle_t *handle, >>>> blocksize); >>>> else >>>> de->inode = 0; >>>> - dir->i_version++; >>>> + inode_inc_iversion(dir); >>>> return 0; >>>> } >>>> i += ext4_rec_len_from_disk(de->rec_len, blocksize); >>>> -- >>>> 1.8.3.1 >>>> >>> >>> >>> Cheers, Andreas >>> >>> >>> >>> >>> >> > > > Cheers, Andreas > > > > >
On Sat, Mar 18, 2017 at 11:16:26AM +0300, Alexey Lyashkov wrote: > Andreas, > > it not about a feature flag. It’s about a situation in whole. > Yes, we may increase a directory size, but it open a number a large problems. > 1) readdir. It tries to read all entries in memory before send to > the user. currently it may eats 20*10^6 * 256 so several gigs, so > increasing it size may produce a problems for a system. That's not true. We normally only read in one block a time. If there is a hash collision, then we may need to insert into the rbtree in a subsequent block's worth of dentries to make sure we have all of the directory entries corresponding to a particular hash value. I think you misunderstood the code. > 2) inode allocation. Current code tries to allocate an inode as near as possible to the directory inode, but one GD may hold 32k entries only, so increase a directory size will use up 1k GD for now and more than it, after it. It increase a seek time with file allocation. It was i mean when say - «dramatically decrease a file creation rate». But there are also only 32k blocks in a group descriptor, and we try to keep the blocks allocated close to the inode. So if you are using a huge directory, and you are using a storage device with a significant seek penalty, then yes, no matter what as the directory grows, the time to iterate over all of the files does grow. But there is more to life than microbenchmarks which creates huge numbers of zero length files! If we assume that the files are going to need to contain _data_, and the data blocks should be close to the inodes, then there are going to be some performance impacts no matter what. > 3) current limit with 4G inodes - currently 32-128 directories may eats a full inode number space. From it perspective large dir don’t need to be used. I can imagine a new feature flag which defines the use a 64-bit inode number, but that's more for people who are creating a file system that takes advantage of 64-bit block numbers, and they are intending on using all of that space to store small (< 4k or < 8k) files. And it's also true that there are huge advantges to using a multi-level directory hierarchy --- e.g.: .git/objects/03/08e42105258d4e53ffeb81ffb2a4b2480bb8b8 or even .git/objects/03/08/e42105258d4e53ffeb81ffb2a4b2480bb8b8 instead of: .git/objects/0308e42105258d4e53ffeb81ffb2a4b2480bb8b8 but that's an application level question. If for some reason some silly application programmer wants to have a single gargantuan directory, if the patches to support it are fairly simple, even if someone is going to give us patches to do something more general, burning an extra feature flag doesn't seem like the most terrible thing in the world. As for the other optimizations --- things like allowing parallel directory modifications, or being able to shrink empty directory blocks or shorten the tree are all improvements we can make without impacting the on-disk format. So they aren't an argument for halting the submission of the new on-disk format, no? - Ted
> 18 марта 2017 г., в 19:29, Theodore Ts'o <tytso@mit.edu> написал(а): > > On Sat, Mar 18, 2017 at 11:16:26AM +0300, Alexey Lyashkov wrote: >> Andreas, >> >> it not about a feature flag. It’s about a situation in whole. >> Yes, we may increase a directory size, but it open a number a large problems. > >> 1) readdir. It tries to read all entries in memory before send to >> the user. currently it may eats 20*10^6 * 256 so several gigs, so >> increasing it size may produce a problems for a system. > > That's not true. We normally only read in one block a time. If there > is a hash collision, then we may need to insert into the rbtree in a > subsequent block's worth of dentries to make sure we have all of the > directory entries corresponding to a particular hash value. I think > you misunderstood the code. As i see it not about hash collisions, but about merging a several blocks into same hash range on up level hash entry. so if we have a large hash range originally assigned to the single block, all that range will read at memory at single step. With «aged» directory when hash blocks used already - it’s easy to hit. > >> 2) inode allocation. Current code tries to allocate an inode as near as possible to the directory inode, but one GD may hold 32k entries only, so increase a directory size will use up 1k GD for now and more than it, after it. It increase a seek time with file allocation. It was i mean when say - «dramatically decrease a file creation rate». > > But there are also only 32k blocks in a group descriptor, and we try > to keep the blocks allocated close to the inode. with bigalloc feature it’s not a 32k blocks. but 32k clusters with 1M cluster size(as example), it very large space. > So if you are using > a huge directory, and you are using a storage device with a > significant seek penalty, then yes, no matter what as the directory > grows, the time to iterate over all of the files does grow. But there > is more to life than microbenchmarks which creates huge numbers of zero > length files! If we assume that the files are going to need to > contain _data_, and the data blocks should be close to the inodes, > then there are going to be some performance impacts no matter what. > Yes, i expect to have some seek penalty. But may testing say it too huge now. directory creation rate started with 80k create/s have dropped to the 20k-30k create/s with hash tree extend to the level 3. Same testing with hard links same create rate dropped slightly. >> 3) current limit with 4G inodes - currently 32-128 directories may eats a full inode number space. From it perspective large dir don’t need to be used. > > I can imagine a new feature flag which defines the use a 64-bit inode > number, but that's more for people who are creating a file system that > takes advantage of 64-bit block numbers, and they are intending on > using all of that space to store small (< 4k or < 8k) files. > > And it's also true that there are huge advantges to using a > multi-level directory hierarchy --- e.g.: > > .git/objects/03/08e42105258d4e53ffeb81ffb2a4b2480bb8b8 > > or even > > .git/objects/03/08/e42105258d4e53ffeb81ffb2a4b2480bb8b8 > > instead of: > > .git/objects/0308e42105258d4e53ffeb81ffb2a4b2480bb8b8 > > but that's an application level question. If for some reason some > silly application programmer wants to have a single gargantuan > directory, if the patches to support it are fairly simple, even if > someone is going to give us patches to do something more general, > burning an extra feature flag doesn't seem like the most terrible > thing in the world. From other side - application don’t expect to have very slow directory and have access with some constant or near or it speed. > > As for the other optimizations --- things like allowing parallel > directory modifications, or being able to shrink empty directory > blocks or shorten the tree are all improvements we can make without > impacting the on-disk format. So they aren't an argument for halting > the submission of the new on-disk format, no? > It’s argument about using this feature. Yes, we can land it, but it decrease an expected speed in some cases. Alexey
On Sat, Mar 18, 2017 at 08:17:55PM +0300, Alexey Lyashkov wrote: > > > > That's not true. We normally only read in one block a time. If there > > is a hash collision, then we may need to insert into the rbtree in a > > subsequent block's worth of dentries to make sure we have all of the > > directory entries corresponding to a particular hash value. I think > > you misunderstood the code. > > As i see it not about hash collisions, but about merging a several > blocks into same hash range on up level hash entry. so if we have a > large hash range originally assigned to the single block, all that > range will read at memory at single step. With «aged» directory > when hash blocks used already - it’s easy to hit. If you look at ext4_htree_fill_tree(), we are only iterating over the leaf blocks. We are using a 31-bit hash, where the low-order bit is one if there has been a collision. In that case, we need to read the next block to make sure all of the directory entries which have the same 31-bit hash are in the rbtree. So *even* if the directory is so badly fragmeneted that there is only one directory entry per block, we will eventually find a block where the last (only) directory entry has a different directory entry from the current hash value, at which point we can stop. And I'll note that this is a problem that can be solved without changing the on-disk format, but simply adding code so we can merge adjacent directory entry blocks. The main reason why it hasn't been done is (a) no one has sent me patches, and (b) I haven't seen workloads where this has been anything other than a theoretical / academic concern. You seem very passionate about this. Is this a problem you've personally seen? If so, can you give me more details about your use case, and how you've been running into this issue? Instead of just arguing about it from a largely theoretical perspective? > Yes, i expect to have some seek penalty. But may testing say it too huge now. > directory creation rate started with 80k create/s have dropped to the 20k-30k create/s with hash tree extend to the level 3. > Same testing with hard links same create rate dropped slightly. So this sounds like it's all about the seek penalty of the _data_ blocks. If you use hard links the creation rate only dropped a little, am I understanding you corretly? (Sorry, your English is a little fracturered so I'm having trouble parsing the meaning out of your sentences.) So what do you think the creation rate _should_ be? And where do you think the time is going to, if it's not the fact that we have to place the data blocks further and further from the directory? And more importantly, what's your proposal for how to "fix" this? > > As for the other optimizations --- things like allowing parallel > > directory modifications, or being able to shrink empty directory > > blocks or shorten the tree are all improvements we can make without > > impacting the on-disk format. So they aren't an argument for halting > > the submission of the new on-disk format, no? > > > It’s argument about using this feature. Yes, we can land it, but it decrease an expected speed in some cases. But there are cases where today, the workload would simply fail with ENOSPC when the directory couldn't grow any farther. So in those cases _maybe_ there is something we could do differently that might make things faster, but you have yet to convince me that the fundamental fault is one that can only be cured by an on-disk format change. (And if you believe this is to be true, please enlighten us on how we can make the on-disk format better!) Cheers, - Ted
sorry for english.. > 19 марта 2017 г., в 3:39, Theodore Ts'o <tytso@mit.edu> написал(а): > > On Sat, Mar 18, 2017 at 08:17:55PM +0300, Alexey Lyashkov wrote: >>> >>> That's not true. We normally only read in one block a time. If there >>> is a hash collision, then we may need to insert into the rbtree in a >>> subsequent block's worth of dentries to make sure we have all of the >>> directory entries corresponding to a particular hash value. I think >>> you misunderstood the code. >> >> As i see it not about hash collisions, but about merging a several >> blocks into same hash range on up level hash entry. so if we have a >> large hash range originally assigned to the single block, all that >> range will read at memory at single step. With «aged» directory >> when hash blocks used already - it’s easy to hit. > > If you look at ext4_htree_fill_tree(), we are only iterating over the > leaf blocks. We are using a 31-bit hash, where the low-order bit is > one if there has been a collision. In that case, we need to read the > next block to make sure all of the directory entries which have the > same 31-bit hash are in the rbtree. looks we say about same but with different words. based on dx code, up level hash block have a records - {hash1, block1} {hash2, block1}… so any records with hash range {hash1, hash2} will live on block1. right ? so question how much {hash1, hash2} distance may be, looks you name it as "hash collision". > You seem very passionate about this. Is this a problem you've > personally seen? If so, can you give me more details about your use > case, and how you've been running into this issue? Instead of just > arguing about it from a largely theoretical perspective? > It problem was seen with Lustre MDT code after large number create/unlinks. but it seen only few times. Other hits isn’t conformed. >> Yes, i expect to have some seek penalty. But may testing say it too huge now. >> directory creation rate started with 80k create/s have dropped to the 20k-30k create/s with hash tree extend to the level 3. >> Same testing with hard links same create rate dropped slightly. > > So this sounds like it's all about the seek penalty of the _data_ > blocks. no. > If you use hard links the creation rate only dropped a > little, am I understanding you corretly? yes and no. hard link create rate dropped a little, but open()+close tests dropped a large. No writes, no data blocks, just inode allocation. > (Sorry, your English is a > little fracturered so I'm having trouble parsing the meaning out of > your sentences.) it’s my bad :( > > So what do you think the creation rate _should_ be? And where do you > think the time is going to, if it's not the fact that we have to place > the data blocks further and further from the directory? And more > importantly, what's your proposal for how to "fix" this? > >>> As for the other optimizations --- things like allowing parallel >>> directory modifications, or being able to shrink empty directory >>> blocks or shorten the tree are all improvements we can make without >>> impacting the on-disk format. So they aren't an argument for halting >>> the submission of the new on-disk format, no? >>> >> It’s argument about using this feature. Yes, we can land it, but it decrease an expected speed in some cases. > > But there are cases where today, the workload would simply fail with > ENOSPC when the directory couldn't grow any farther. So in those > cases _maybe_ there is something we could do differently that might > make things faster, but you have yet to convince me that the > fundamental fault is one that can only be cured by an on-disk format > change. (And if you believe this is to be true, please enlighten us > on how we can make the on-disk format better!) > > Cheers, > > - Ted
On Mar 18, 2017, at 10:29 AM, Theodore Ts'o <tytso@mit.edu> wrote: > > On Sat, Mar 18, 2017 at 11:16:26AM +0300, Alexey Lyashkov wrote: >> Andreas, >> >> it not about a feature flag. It’s about a situation in whole. >> Yes, we may increase a directory size, but it open a number a large problems. > >> 1) readdir. It tries to read all entries in memory before send to >> the user. currently it may eats 20*10^6 * 256 so several gigs, so >> increasing it size may produce a problems for a system. > > That's not true. We normally only read in one block a time. If there > is a hash collision, then we may need to insert into the rbtree in a > subsequent block's worth of dentries to make sure we have all of the > directory entries corresponding to a particular hash value. I think > you misunderstood the code. > >> 2) inode allocation. Current code tries to allocate an inode as near as possible to the directory inode, but one GD may hold 32k entries only, so increase a directory size will use up 1k GD for now and more than it, after it. It increase a seek time with file allocation. It was i mean when say - «dramatically decrease a file creation rate». > > But there are also only 32k blocks in a group descriptor, and we try > to keep the blocks allocated close to the inode. So if you are using > a huge directory, and you are using a storage device with a > significant seek penalty, then yes, no matter what as the directory > grows, the time to iterate over all of the files does grow. But there > is more to life than microbenchmarks which creates huge numbers of zero > length files! If we assume that the files are going to need to > contain _data_, and the data blocks should be close to the inodes, > then there are going to be some performance impacts no matter what. Actually, on a Lustre MDT there _are_ only zero-length files, since all of the data is stored in another filesystem. Fortunately, the parent directory stores the last group successfully used for allocation (i_alloc_group) so that new inode allocation doesn't have to scan the whole filesystem each time from the parent's group. >> 3) current limit with 4G inodes - currently 32-128 directories may eats a full inode number space. From it perspective large dir don’t need to be used. > > I can imagine a new feature flag which defines the use a 64-bit inode > number, but that's more for people who are creating a file system that > takes advantage of 64-bit block numbers, and they are intending on > using all of that space to store small (< 4k or < 8k) files. The 4-billion inode limit is somewhat independent of large directories. That said, the DIRDATA feature that is used for Lustre is also designed to allow storing the high 32 bits of the inode number in the directory. This would allow compatible upgrade of a directory to storing both 32-bit and 64-bit inode numbers without the need for wholescale conversion of directories, or having space for 64-bit inode numbers even if most of the inodes are only 32-bit values. Cheers, Andreas
On Mar 18, 2017, at 6:39 PM, Theodore Ts'o <tytso@mit.edu> wrote: > > On Sat, Mar 18, 2017 at 08:17:55PM +0300, Alexey Lyashkov wrote: >>>> 1) readdir. It tries to read all entries in memory before send to >>>> the user. currently it may eats 20*10^6 * 256 so several gigs, so >>>> increasing it size may produce a problems for a system. >>> >>> That's not true. We normally only read in one block a time. If there >>> is a hash collision, then we may need to insert into the rbtree in a >>> subsequent block's worth of dentries to make sure we have all of the >>> directory entries corresponding to a particular hash value. I think >>> you misunderstood the code. >> >> As i see it not about hash collisions, but about merging a several >> blocks into same hash range on up level hash entry. so if we have a >> large hash range originally assigned to the single block, all that >> range will read at memory at single step. With «aged» directory >> when hash blocks used already - it’s easy to hit. > > If you look at ext4_htree_fill_tree(), we are only iterating over the > leaf blocks. We are using a 31-bit hash, where the low-order bit is > one if there has been a collision. In that case, we need to read the > next block to make sure all of the directory entries which have the > same 31-bit hash are in the rbtree. I guess the main concern here is that insertion into an htree directory is essentially random based on the hash, so for very large directories the whole directory (htree and leaf blocks) needs to fit into RAM or the performance will suffer since every leaf block will need to be read from disk for every create/lookup/delete. That said, the maximum size of a 2-level htree directory is about 1GB, and servers have 128 or 256GB of RAM these days, so I don't think this will be a huge problem. We also have a tunable that can limit the maximum directory size if this becomes a problem. > And I'll note that this is a problem that can be solved without > changing the on-disk format, but simply adding code so we can merge > adjacent directory entry blocks. The main reason why it hasn't been > done is (a) no one has sent me patches, and (b) I haven't seen > workloads where this has been anything other than a theoretical / > academic concern. > > You seem very passionate about this. Is this a problem you've > personally seen? If so, can you give me more details about your use > case, and how you've been running into this issue? Instead of just > arguing about it from a largely theoretical perspective? We have seen large directories at the htree limit unable to add new entries because the htree/leaf blocks become fragmented from repeated create/delete cycles. I agree that handling directory shrinking would probably solve that problem, since the htree and leaf blocks would be compacted during deletion and then the htree would be able to split the leaf blocks in the right location for the new entries. Cheers, Andreas >> Yes, i expect to have some seek penalty. But may testing say it too huge >> now. directory creation rate started with 80k create/s have dropped >> to the 20k-30k create/s with hash tree extend to the level 3. Same >> testing with hard links same create rate dropped slightly. > > So this sounds like it's all about the seek penalty of the _data_ > blocks. If you use hard links the creation rate only dropped a > little, am I understanding you correctly? (Sorry, your English is a > little fracturered so I'm having trouble parsing the meaning out of > your sentences.) > > So what do you think the creation rate _should_ be? And where do you > think the time is going to, if it's not the fact that we have to place > the data blocks further and further from the directory? And more > importantly, what's your proposal for how to "fix" this? > >>> As for the other optimizations --- things like allowing parallel >>> directory modifications, or being able to shrink empty directory >>> blocks or shorten the tree are all improvements we can make without >>> impacting the on-disk format. So they aren't an argument for halting >>> the submission of the new on-disk format, no? >>> >> It’s argument about using this feature. Yes, we can land it, but it decrease an expected speed in some cases. > > But there are cases where today, the workload would simply fail with > ENOSPC when the directory couldn't grow any farther. So in those > cases _maybe_ there is something we could do differently that might > make things faster, but you have yet to convince me that the > fundamental fault is one that can only be cured by an on-disk format > change. (And if you believe this is to be true, please enlighten us > on how we can make the on-disk format better!) > > Cheers, > > - Ted Cheers, Andreas
On Thu, Mar 16, 2017 at 05:51:17AM -0400, Artem Blagodarenko wrote: > From: Artem Blagodarenko <artem.blagodarenko@gmail.com> > > This INCOMPAT_LARGEDIR feature allows larger directories to be created > in ldiskfs, both with directory sizes over 2GB and and a maximum htree > depth of 3 instead of the current limit of 2. These features are needed > in order to exceed the current limit of approximately 10M entries in a > single directory. > > Signed-off-by: Yang Sheng <yang.sheng@intel.com> > Signed-off-by: Artem Blagodarenko <artem.blagodarenko@seagate.com> Thanks, applied. - Ted
On Sat, Apr 29, 2017 at 08:59:53PM -0400, Theodore Ts'o wrote: > On Thu, Mar 16, 2017 at 05:51:17AM -0400, Artem Blagodarenko wrote: > > From: Artem Blagodarenko <artem.blagodarenko@gmail.com> > > > > This INCOMPAT_LARGEDIR feature allows larger directories to be created > > in ldiskfs, both with directory sizes over 2GB and and a maximum htree > > depth of 3 instead of the current limit of 2. These features are needed > > in order to exceed the current limit of approximately 10M entries in a > > single directory. > > > > Signed-off-by: Yang Sheng <yang.sheng@intel.com> > > Signed-off-by: Artem Blagodarenko <artem.blagodarenko@seagate.com> > > Thanks, applied. > > - Ted FYI, this patch is causing a problem when creating many files in a directory (without the largedir feature enabled). I haven't looked into it but maybe it will ring a bell for someone. seq -f "abcdefghijklmnopqrstuvwxyz012345%.0f" 100000 | xargs touch [ 42.726480] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum [ 42.729472] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum [ 42.731689] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum [ 42.734303] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum [ 42.736383] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum [ 42.739133] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum [ 42.741307] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum [ 42.743963] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum [ 42.745998] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum ...
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 01d52b9..0bbbd9b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1799,7 +1799,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) EXT4_FEATURE_INCOMPAT_MMP | \ EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ EXT4_FEATURE_INCOMPAT_ENCRYPT | \ - EXT4_FEATURE_INCOMPAT_CSUM_SEED) + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ + EXT4_FEATURE_INCOMPAT_LARGEDIR) #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ @@ -2125,6 +2126,16 @@ struct dir_private_info { */ #define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) +/* htree levels for ext4 */ +#define EXT4_HTREE_LEVEL_COMPAT 2 +#define EXT4_HTREE_LEVEL 3 + +static inline int ext4_dir_htree_level(struct super_block *sb) +{ + return ext4_has_feature_largedir(sb) ? + EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; +} + /* * Timeout and state flag for lazy initialization inode thread. */ @@ -2758,13 +2769,15 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); } -static inline loff_t ext4_isize(struct ext4_inode *raw_inode) +static inline loff_t ext4_isize(struct super_block *sb, + struct ext4_inode *raw_inode) { - if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) + if (ext4_has_feature_largedir(sb) || + S_ISREG(le16_to_cpu(raw_inode->i_mode))) return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | le32_to_cpu(raw_inode->i_size_lo); - else - return (loff_t) le32_to_cpu(raw_inode->i_size_lo); + + return (loff_t) le32_to_cpu(raw_inode->i_size_lo); } static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f622d4a..5787f3d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4682,7 +4682,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (ext4_has_feature_64bit(sb)) ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; - inode->i_size = ext4_isize(raw_inode); + inode->i_size = ext4_isize(sb, raw_inode); if ((size = i_size_read(inode)) < 0) { EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size); ret = -EFSCORRUPTED; @@ -5008,7 +5008,7 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_file_acl_high = cpu_to_le16(ei->i_file_acl >> 32); raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); - if (ei->i_disksize != ext4_isize(raw_inode)) { + if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) { ext4_isize_set(raw_inode, ei->i_disksize); need_datasync = 1; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 6ad612c..3298fe3 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -513,7 +513,7 @@ static inline int ext4_handle_dirty_dx_node(handle_t *handle, static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) { - return le32_to_cpu(entry->block) & 0x00ffffff; + return le32_to_cpu(entry->block) & 0x0fffffff; } static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) @@ -739,6 +739,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); u32 hash; + memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0])); frame->bh = ext4_read_dirblock(dir, 0, INDEX); if (IS_ERR(frame->bh)) return (struct dx_frame *) frame->bh; @@ -768,9 +769,15 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, } indirect = root->info.indirect_levels; - if (indirect > 1) { - ext4_warning_inode(dir, "Unimplemented hash depth: %#06x", - root->info.indirect_levels); + if (indirect >= ext4_dir_htree_level(dir->i_sb)) { + ext4_warning(dir->i_sb, + "Directory (ino: %lu) htree depth %#06x exceed" + "supported value", dir->i_ino, + ext4_dir_htree_level(dir->i_sb)); + if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) { + ext4_warning(dir->i_sb, "Enable large directory " + "feature to access it"); + } goto fail; } @@ -859,12 +866,19 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, static void dx_release(struct dx_frame *frames) { + struct dx_root_info *info; + int i; + if (frames[0].bh == NULL) return; - if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels) - brelse(frames[1].bh); - brelse(frames[0].bh); + info = &((struct dx_root *)frames[0].bh->b_data)->info; + for (i = 0; i <= info->indirect_levels; i++) { + if (frames[i].bh == NULL) + break; + brelse(frames[i].bh); + frames[i].bh = NULL; + } } /* @@ -1050,7 +1064,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, { struct dx_hash_info hinfo; struct ext4_dir_entry_2 *de; - struct dx_frame frames[2], *frame; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct inode *dir; ext4_lblk_t block; int count = 0; @@ -1517,7 +1531,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, struct ext4_dir_entry_2 **res_dir) { struct super_block * sb = dir->i_sb; - struct dx_frame frames[2], *frame; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; const struct qstr *d_name = fname->usr_fname; struct buffer_head *bh; ext4_lblk_t block; @@ -1947,7 +1961,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, */ dir->i_mtime = dir->i_ctime = current_time(dir); ext4_update_dx_flag(dir); - dir->i_version++; + inode_inc_iversion(dir); ext4_mark_inode_dirty(handle, dir); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_dirent_node(handle, dir, bh); @@ -1966,7 +1980,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, { struct buffer_head *bh2; struct dx_root *root; - struct dx_frame frames[2], *frame; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct dx_entry *entries; struct ext4_dir_entry_2 *de, *de2; struct ext4_dir_entry_tail *t; @@ -2185,13 +2199,16 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode) { - struct dx_frame frames[2], *frame; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct dx_entry *entries, *at; struct buffer_head *bh; struct super_block *sb = dir->i_sb; struct ext4_dir_entry_2 *de; + int restart; int err; +again: + restart = 0; frame = dx_probe(fname, dir, NULL, frames); if (IS_ERR(frame)) return PTR_ERR(frame); @@ -2213,24 +2230,44 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, if (err != -ENOSPC) goto cleanup; + err = 0; /* Block full, should compress but for now just split */ dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", dx_get_count(entries), dx_get_limit(entries))); /* Need to split index? */ if (dx_get_count(entries) == dx_get_limit(entries)) { ext4_lblk_t newblock; - unsigned icount = dx_get_count(entries); - int levels = frame - frames; + int levels = frame - frames + 1; + unsigned int icount; + int add_level = 1; struct dx_entry *entries2; struct dx_node *node2; struct buffer_head *bh2; - if (levels && (dx_get_count(frames->entries) == - dx_get_limit(frames->entries))) { - ext4_warning_inode(dir, "Directory index full!"); + while (frame > frames) { + if (dx_get_count((frame - 1)->entries) < + dx_get_limit((frame - 1)->entries)) { + add_level = 0; + break; + } + frame--; /* split higher index block */ + at = frame->at; + entries = frame->entries; + restart = 1; + } + if (add_level && levels == ext4_dir_htree_level(sb)) { + ext4_warning(sb, "Directory (ino: %lu) index full, " + "reach max htree level :%d", + dir->i_ino, levels); + if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) { + ext4_warning(sb, "Large directory feature is " + "not enabled on this " + "filesystem"); + } err = -ENOSPC; goto cleanup; } + icount = dx_get_count(entries); bh2 = ext4_append(handle, dir, &newblock); if (IS_ERR(bh2)) { err = PTR_ERR(bh2); @@ -2245,7 +2282,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, err = ext4_journal_get_write_access(handle, frame->bh); if (err) goto journal_error; - if (levels) { + if (!add_level) { unsigned icount1 = icount/2, icount2 = icount - icount1; unsigned hash2 = dx_get_hash(entries + icount1); dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", @@ -2253,7 +2290,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ err = ext4_journal_get_write_access(handle, - frames[0].bh); + (frame - 1)->bh); if (err) goto journal_error; @@ -2269,17 +2306,23 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, frame->entries = entries = entries2; swap(frame->bh, bh2); } - dx_insert_block(frames + 0, hash2, newblock); - dxtrace(dx_show_index("node", frames[1].entries)); + dx_insert_block((frame - 1), hash2, newblock); + dxtrace(dx_show_index("node", frame->entries)); dxtrace(dx_show_index("node", ((struct dx_node *) bh2->b_data)->entries)); err = ext4_handle_dirty_dx_node(handle, dir, bh2); if (err) goto journal_error; brelse (bh2); + ext4_handle_dirty_metadata(handle, dir, + (frame - 1)->bh); + if (restart) { + ext4_handle_dirty_metadata(handle, dir, + frame->bh); + goto cleanup; + } } else { - dxtrace(printk(KERN_DEBUG - "Creating second level index...\n")); + struct dx_root *dxroot; memcpy((char *) entries2, (char *) entries, icount * sizeof(struct dx_entry)); dx_set_limit(entries2, dx_node_limit(dir)); @@ -2287,19 +2330,17 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, /* Set up root */ dx_set_count(entries, 1); dx_set_block(entries + 0, newblock); - ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; - - /* Add new access path frame */ - frame = frames + 1; - frame->at = at = at - entries + entries2; - frame->entries = entries = entries2; - frame->bh = bh2; - err = ext4_journal_get_write_access(handle, - frame->bh); - if (err) - goto journal_error; + dxroot = (struct dx_root *)frames[0].bh->b_data; + dxroot->info.indirect_levels += 1; + dxtrace(printk(KERN_DEBUG + "Creating %d level index...\n", + info->indirect_levels)); + ext4_handle_dirty_metadata(handle, dir, frame->bh); + ext4_handle_dirty_metadata(handle, dir, bh2); + brelse(bh2); + restart = 1; + goto cleanup; } - err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh); if (err) { ext4_std_error(inode->i_sb, err); goto cleanup; @@ -2318,6 +2359,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, cleanup: brelse(bh); dx_release(frames); + /* @restart is true means htree-path has been changed, we need to + * repeat dx_probe() to find out valid htree-path + */ + if (restart && err == 0) + goto again; return err; } @@ -2354,7 +2400,7 @@ int ext4_generic_delete_entry(handle_t *handle, blocksize); else de->inode = 0; - dir->i_version++; + inode_inc_iversion(dir); return 0; } i += ext4_rec_len_from_disk(de->rec_len, blocksize);