Add largedir feature

Message ID 1489657877-34478-1-git-send-email-artem.blagodarenko@seagate.com
State New
Headers show

Commit Message

Artem Blagodarenko March 16, 2017, 9:51 a.m.
From: Artem Blagodarenko <artem.blagodarenko@gmail.com>

This INCOMPAT_LARGEDIR feature allows larger directories to be created
in ldiskfs, both with directory sizes over 2GB and and a maximum htree
depth of 3 instead of the current limit of 2. These features are needed
in order to exceed the current limit of approximately 10M entries in a
single directory.

Signed-off-by: Yang Sheng <yang.sheng@intel.com>
Signed-off-by: Artem Blagodarenko <artem.blagodarenko@seagate.com>
---
 fs/ext4/ext4.h  |  23 ++++++++---
 fs/ext4/inode.c |   4 +-
 fs/ext4/namei.c | 118 +++++++++++++++++++++++++++++++++++++++-----------------
 3 files changed, 102 insertions(+), 43 deletions(-)

Comments

Andreas Dilger March 16, 2017, 9:44 p.m. | #1
On Mar 16, 2017, at 3:51 AM, Artem Blagodarenko <artem.blagodarenko@gmail.com> wrote:
> 
> From: Artem Blagodarenko <artem.blagodarenko@gmail.com>
> 
> This INCOMPAT_LARGEDIR feature allows larger directories to be created
> in ldiskfs, both with directory sizes over 2GB and and a maximum htree
> depth of 3 instead of the current limit of 2. These features are needed
> in order to exceed the current limit of approximately 10M entries in a
> single directory.

Artem,
many thanks for updating and submitting this, and creating the e2fsck patches.

Ted,
can you please add the original author for the largedir ext4 patch when
landing this patch: Liang Zhen <liang.zhen@intel.com>

and these tags which contain more background information on this feature:

Lustre-change: https://review.hpdd.intel.com/375
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-50

> Signed-off-by: Yang Sheng <yang.sheng@intel.com>
> Signed-off-by: Artem Blagodarenko <artem.blagodarenko@seagate.com>

Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>

> ---
> fs/ext4/ext4.h  |  23 ++++++++---
> fs/ext4/inode.c |   4 +-
> fs/ext4/namei.c | 118 +++++++++++++++++++++++++++++++++++++++-----------------
> 3 files changed, 102 insertions(+), 43 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 01d52b9..0bbbd9b 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1799,7 +1799,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
> 					 EXT4_FEATURE_INCOMPAT_MMP | \
> 					 EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
> 					 EXT4_FEATURE_INCOMPAT_ENCRYPT | \
> -					 EXT4_FEATURE_INCOMPAT_CSUM_SEED)
> +					 EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
> +					 EXT4_FEATURE_INCOMPAT_LARGEDIR)
> #define EXT4_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
> 					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
> 					 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
> @@ -2125,6 +2126,16 @@ struct dir_private_info {
>  */
> #define ERR_BAD_DX_DIR	(-(MAX_ERRNO - 1))
> 
> +/* htree levels for ext4 */
> +#define	EXT4_HTREE_LEVEL_COMPAT	2
> +#define	EXT4_HTREE_LEVEL	3
> +
> +static inline int ext4_dir_htree_level(struct super_block *sb)
> +{
> +	return ext4_has_feature_largedir(sb) ?
> +		EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
> +}
> +
> /*
>  * Timeout and state flag for lazy initialization inode thread.
>  */
> @@ -2758,13 +2769,15 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
> 	es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
> }
> 
> -static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
> +static inline loff_t ext4_isize(struct super_block *sb,
> +				struct ext4_inode *raw_inode)
> {
> -	if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
> +	if (ext4_has_feature_largedir(sb) ||
> +	    S_ISREG(le16_to_cpu(raw_inode->i_mode)))
> 		return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
> 			le32_to_cpu(raw_inode->i_size_lo);
> -	else
> -		return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
> +
> +	return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
> }
> 
> static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index f622d4a..5787f3d 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4682,7 +4682,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
> 	if (ext4_has_feature_64bit(sb))
> 		ei->i_file_acl |=
> 			((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
> -	inode->i_size = ext4_isize(raw_inode);
> +	inode->i_size = ext4_isize(sb, raw_inode);
> 	if ((size = i_size_read(inode)) < 0) {
> 		EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
> 		ret = -EFSCORRUPTED;
> @@ -5008,7 +5008,7 @@ static int ext4_do_update_inode(handle_t *handle,
> 		raw_inode->i_file_acl_high =
> 			cpu_to_le16(ei->i_file_acl >> 32);
> 	raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
> -	if (ei->i_disksize != ext4_isize(raw_inode)) {
> +	if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
> 		ext4_isize_set(raw_inode, ei->i_disksize);
> 		need_datasync = 1;
> 	}
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index 6ad612c..3298fe3 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -513,7 +513,7 @@ static inline int ext4_handle_dirty_dx_node(handle_t *handle,
> 
> static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
> {
> -	return le32_to_cpu(entry->block) & 0x00ffffff;
> +	return le32_to_cpu(entry->block) & 0x0fffffff;
> }

It wouldn't be terrible to add a comment to this function explaining why
the block numbers are masked off, which is to allow the high bits of the
logical block number to hold a "fullness" fraction so that the kernel
could start shrinking directories if many files are deleted.  That said,
this isn't a shortcoming of this patch, but a suggestion if the patch is
being resubmitted for some other reason.

> static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
> @@ -739,6 +739,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
> 	struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
> 	u32 hash;
> 
> +	memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
> 	frame->bh = ext4_read_dirblock(dir, 0, INDEX);
> 	if (IS_ERR(frame->bh))
> 		return (struct dx_frame *) frame->bh;
> @@ -768,9 +769,15 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
> 	}
> 
> 	indirect = root->info.indirect_levels;
> -	if (indirect > 1) {
> -		ext4_warning_inode(dir, "Unimplemented hash depth: %#06x",
> -				   root->info.indirect_levels);
> +	if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
> +		ext4_warning(dir->i_sb,
> +			     "Directory (ino: %lu) htree depth %#06x exceed"
> +			     "supported value", dir->i_ino,
> +			     ext4_dir_htree_level(dir->i_sb));
> +		if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
> +			ext4_warning(dir->i_sb, "Enable large directory "
> +						"feature to access it");
> +		}
> 		goto fail;
> 	}
> 
> @@ -859,12 +866,19 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
> 
> static void dx_release(struct dx_frame *frames)
> {
> +	struct dx_root_info *info;
> +	int i;
> +
> 	if (frames[0].bh == NULL)
> 		return;
> 
> -	if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels)
> -		brelse(frames[1].bh);
> -	brelse(frames[0].bh);
> +	info = &((struct dx_root *)frames[0].bh->b_data)->info;
> +	for (i = 0; i <= info->indirect_levels; i++) {
> +		if (frames[i].bh == NULL)
> +			break;
> +		brelse(frames[i].bh);
> +		frames[i].bh = NULL;
> +	}
> }
> 
> /*
> @@ -1050,7 +1064,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
> {
> 	struct dx_hash_info hinfo;
> 	struct ext4_dir_entry_2 *de;
> -	struct dx_frame frames[2], *frame;
> +	struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
> 	struct inode *dir;
> 	ext4_lblk_t block;
> 	int count = 0;
> @@ -1517,7 +1531,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
> 			struct ext4_dir_entry_2 **res_dir)
> {
> 	struct super_block * sb = dir->i_sb;
> -	struct dx_frame frames[2], *frame;
> +	struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
> 	const struct qstr *d_name = fname->usr_fname;
> 	struct buffer_head *bh;
> 	ext4_lblk_t block;
> @@ -1947,7 +1961,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
> 	 */
> 	dir->i_mtime = dir->i_ctime = current_time(dir);
> 	ext4_update_dx_flag(dir);
> -	dir->i_version++;
> +	inode_inc_iversion(dir);
> 	ext4_mark_inode_dirty(handle, dir);
> 	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
> 	err = ext4_handle_dirty_dirent_node(handle, dir, bh);
> @@ -1966,7 +1980,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
> {
> 	struct buffer_head *bh2;
> 	struct dx_root	*root;
> -	struct dx_frame	frames[2], *frame;
> +	struct dx_frame	frames[EXT4_HTREE_LEVEL], *frame;
> 	struct dx_entry *entries;
> 	struct ext4_dir_entry_2	*de, *de2;
> 	struct ext4_dir_entry_tail *t;
> @@ -2185,13 +2199,16 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
> static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
> 			     struct inode *dir, struct inode *inode)
> {
> -	struct dx_frame frames[2], *frame;
> +	struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
> 	struct dx_entry *entries, *at;
> 	struct buffer_head *bh;
> 	struct super_block *sb = dir->i_sb;
> 	struct ext4_dir_entry_2 *de;
> +	int restart;
> 	int err;
> 
> +again:
> +	restart = 0;
> 	frame = dx_probe(fname, dir, NULL, frames);
> 	if (IS_ERR(frame))
> 		return PTR_ERR(frame);
> @@ -2213,24 +2230,44 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
> 	if (err != -ENOSPC)
> 		goto cleanup;
> 
> +	err = 0;
> 	/* Block full, should compress but for now just split */
> 	dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
> 		       dx_get_count(entries), dx_get_limit(entries)));
> 	/* Need to split index? */
> 	if (dx_get_count(entries) == dx_get_limit(entries)) {
> 		ext4_lblk_t newblock;
> -		unsigned icount = dx_get_count(entries);
> -		int levels = frame - frames;
> +		int levels = frame - frames + 1;
> +		unsigned int icount;
> +		int add_level = 1;
> 		struct dx_entry *entries2;
> 		struct dx_node *node2;
> 		struct buffer_head *bh2;
> 
> -		if (levels && (dx_get_count(frames->entries) ==
> -			       dx_get_limit(frames->entries))) {
> -			ext4_warning_inode(dir, "Directory index full!");
> +		while (frame > frames) {
> +			if (dx_get_count((frame - 1)->entries) <
> +			    dx_get_limit((frame - 1)->entries)) {
> +				add_level = 0;
> +				break;
> +			}
> +			frame--; /* split higher index block */
> +			at = frame->at;
> +			entries = frame->entries;
> +			restart = 1;
> +		}
> +		if (add_level && levels == ext4_dir_htree_level(sb)) {
> +			ext4_warning(sb, "Directory (ino: %lu) index full, "
> +					 "reach max htree level :%d",
> +					 dir->i_ino, levels);
> +			if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
> +				ext4_warning(sb, "Large directory feature is "
> +						 "not enabled on this "
> +						 "filesystem");
> +			}
> 			err = -ENOSPC;
> 			goto cleanup;
> 		}
> +		icount = dx_get_count(entries);
> 		bh2 = ext4_append(handle, dir, &newblock);
> 		if (IS_ERR(bh2)) {
> 			err = PTR_ERR(bh2);
> @@ -2245,7 +2282,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
> 		err = ext4_journal_get_write_access(handle, frame->bh);
> 		if (err)
> 			goto journal_error;
> -		if (levels) {
> +		if (!add_level) {
> 			unsigned icount1 = icount/2, icount2 = icount - icount1;
> 			unsigned hash2 = dx_get_hash(entries + icount1);
> 			dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
> @@ -2253,7 +2290,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
> 
> 			BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
> 			err = ext4_journal_get_write_access(handle,
> -							     frames[0].bh);
> +							     (frame - 1)->bh);
> 			if (err)
> 				goto journal_error;
> 
> @@ -2269,17 +2306,23 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
> 				frame->entries = entries = entries2;
> 				swap(frame->bh, bh2);
> 			}
> -			dx_insert_block(frames + 0, hash2, newblock);
> -			dxtrace(dx_show_index("node", frames[1].entries));
> +			dx_insert_block((frame - 1), hash2, newblock);
> +			dxtrace(dx_show_index("node", frame->entries));
> 			dxtrace(dx_show_index("node",
> 			       ((struct dx_node *) bh2->b_data)->entries));
> 			err = ext4_handle_dirty_dx_node(handle, dir, bh2);
> 			if (err)
> 				goto journal_error;
> 			brelse (bh2);
> +			ext4_handle_dirty_metadata(handle, dir,
> +						   (frame - 1)->bh);
> +			if (restart) {
> +				ext4_handle_dirty_metadata(handle, dir,
> +							   frame->bh);
> +				goto cleanup;
> +			}
> 		} else {
> -			dxtrace(printk(KERN_DEBUG
> -				       "Creating second level index...\n"));
> +			struct dx_root *dxroot;
> 			memcpy((char *) entries2, (char *) entries,
> 			       icount * sizeof(struct dx_entry));
> 			dx_set_limit(entries2, dx_node_limit(dir));
> @@ -2287,19 +2330,17 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
> 			/* Set up root */
> 			dx_set_count(entries, 1);
> 			dx_set_block(entries + 0, newblock);
> -			((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
> -
> -			/* Add new access path frame */
> -			frame = frames + 1;
> -			frame->at = at = at - entries + entries2;
> -			frame->entries = entries = entries2;
> -			frame->bh = bh2;
> -			err = ext4_journal_get_write_access(handle,
> -							     frame->bh);
> -			if (err)
> -				goto journal_error;
> +			dxroot = (struct dx_root *)frames[0].bh->b_data;
> +			dxroot->info.indirect_levels += 1;
> +			dxtrace(printk(KERN_DEBUG
> +				       "Creating %d level index...\n",
> +				       info->indirect_levels));
> +			ext4_handle_dirty_metadata(handle, dir, frame->bh);
> +			ext4_handle_dirty_metadata(handle, dir, bh2);
> +			brelse(bh2);
> +			restart = 1;
> +			goto cleanup;
> 		}
> -		err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
> 		if (err) {
> 			ext4_std_error(inode->i_sb, err);
> 			goto cleanup;
> @@ -2318,6 +2359,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
> cleanup:
> 	brelse(bh);
> 	dx_release(frames);
> +	/* @restart is true means htree-path has been changed, we need to
> +	 * repeat dx_probe() to find out valid htree-path
> +	 */
> +	if (restart && err == 0)
> +		goto again;
> 	return err;
> }
> 
> @@ -2354,7 +2400,7 @@ int ext4_generic_delete_entry(handle_t *handle,
> 					blocksize);
> 			else
> 				de->inode = 0;
> -			dir->i_version++;
> +			inode_inc_iversion(dir);
> 			return 0;
> 		}
> 		i += ext4_rec_len_from_disk(de->rec_len, blocksize);
> --
> 1.8.3.1
> 


Cheers, Andreas
Alexey Lyahkov March 17, 2017, 6:15 a.m. | #2
Andreas,

I not strongly against it patch, but my testing say - 3 level h-tree need a more than 20mil files in directory, and creation rate was dropped dramatically,
a specially without parallel dir operations, Did we need do some other research changes with landing it patch as disk format will changed anyway by incompatible way ?


> 17 марта 2017 г., в 0:44, Andreas Dilger <adilger@dilger.ca> написал(а):
> 
> On Mar 16, 2017, at 3:51 AM, Artem Blagodarenko <artem.blagodarenko@gmail.com> wrote:
>> 
>> From: Artem Blagodarenko <artem.blagodarenko@gmail.com>
>> 
>> This INCOMPAT_LARGEDIR feature allows larger directories to be created
>> in ldiskfs, both with directory sizes over 2GB and and a maximum htree
>> depth of 3 instead of the current limit of 2. These features are needed
>> in order to exceed the current limit of approximately 10M entries in a
>> single directory.
> 
> Artem,
> many thanks for updating and submitting this, and creating the e2fsck patches.
> 
> Ted,
> can you please add the original author for the largedir ext4 patch when
> landing this patch: Liang Zhen <liang.zhen@intel.com>
> 
> and these tags which contain more background information on this feature:
> 
> Lustre-change: https://review.hpdd.intel.com/375
> Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-50
> 
>> Signed-off-by: Yang Sheng <yang.sheng@intel.com>
>> Signed-off-by: Artem Blagodarenko <artem.blagodarenko@seagate.com>
> 
> Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
> 
>> ---
>> fs/ext4/ext4.h  |  23 ++++++++---
>> fs/ext4/inode.c |   4 +-
>> fs/ext4/namei.c | 118 +++++++++++++++++++++++++++++++++++++++-----------------
>> 3 files changed, 102 insertions(+), 43 deletions(-)
>> 
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index 01d52b9..0bbbd9b 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -1799,7 +1799,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
>> 					 EXT4_FEATURE_INCOMPAT_MMP | \
>> 					 EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
>> 					 EXT4_FEATURE_INCOMPAT_ENCRYPT | \
>> -					 EXT4_FEATURE_INCOMPAT_CSUM_SEED)
>> +					 EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
>> +					 EXT4_FEATURE_INCOMPAT_LARGEDIR)
>> #define EXT4_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
>> 					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
>> 					 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
>> @@ -2125,6 +2126,16 @@ struct dir_private_info {
>> */
>> #define ERR_BAD_DX_DIR	(-(MAX_ERRNO - 1))
>> 
>> +/* htree levels for ext4 */
>> +#define	EXT4_HTREE_LEVEL_COMPAT	2
>> +#define	EXT4_HTREE_LEVEL	3
>> +
>> +static inline int ext4_dir_htree_level(struct super_block *sb)
>> +{
>> +	return ext4_has_feature_largedir(sb) ?
>> +		EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
>> +}
>> +
>> /*
>> * Timeout and state flag for lazy initialization inode thread.
>> */
>> @@ -2758,13 +2769,15 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
>> 	es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
>> }
>> 
>> -static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
>> +static inline loff_t ext4_isize(struct super_block *sb,
>> +				struct ext4_inode *raw_inode)
>> {
>> -	if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
>> +	if (ext4_has_feature_largedir(sb) ||
>> +	    S_ISREG(le16_to_cpu(raw_inode->i_mode)))
>> 		return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
>> 			le32_to_cpu(raw_inode->i_size_lo);
>> -	else
>> -		return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
>> +
>> +	return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
>> }
>> 
>> static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index f622d4a..5787f3d 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -4682,7 +4682,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
>> 	if (ext4_has_feature_64bit(sb))
>> 		ei->i_file_acl |=
>> 			((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
>> -	inode->i_size = ext4_isize(raw_inode);
>> +	inode->i_size = ext4_isize(sb, raw_inode);
>> 	if ((size = i_size_read(inode)) < 0) {
>> 		EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
>> 		ret = -EFSCORRUPTED;
>> @@ -5008,7 +5008,7 @@ static int ext4_do_update_inode(handle_t *handle,
>> 		raw_inode->i_file_acl_high =
>> 			cpu_to_le16(ei->i_file_acl >> 32);
>> 	raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
>> -	if (ei->i_disksize != ext4_isize(raw_inode)) {
>> +	if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
>> 		ext4_isize_set(raw_inode, ei->i_disksize);
>> 		need_datasync = 1;
>> 	}
>> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
>> index 6ad612c..3298fe3 100644
>> --- a/fs/ext4/namei.c
>> +++ b/fs/ext4/namei.c
>> @@ -513,7 +513,7 @@ static inline int ext4_handle_dirty_dx_node(handle_t *handle,
>> 
>> static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
>> {
>> -	return le32_to_cpu(entry->block) & 0x00ffffff;
>> +	return le32_to_cpu(entry->block) & 0x0fffffff;
>> }
> 
> It wouldn't be terrible to add a comment to this function explaining why
> the block numbers are masked off, which is to allow the high bits of the
> logical block number to hold a "fullness" fraction so that the kernel
> could start shrinking directories if many files are deleted.  That said,
> this isn't a shortcoming of this patch, but a suggestion if the patch is
> being resubmitted for some other reason.
> 
>> static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
>> @@ -739,6 +739,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
>> 	struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
>> 	u32 hash;
>> 
>> +	memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
>> 	frame->bh = ext4_read_dirblock(dir, 0, INDEX);
>> 	if (IS_ERR(frame->bh))
>> 		return (struct dx_frame *) frame->bh;
>> @@ -768,9 +769,15 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
>> 	}
>> 
>> 	indirect = root->info.indirect_levels;
>> -	if (indirect > 1) {
>> -		ext4_warning_inode(dir, "Unimplemented hash depth: %#06x",
>> -				   root->info.indirect_levels);
>> +	if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
>> +		ext4_warning(dir->i_sb,
>> +			     "Directory (ino: %lu) htree depth %#06x exceed"
>> +			     "supported value", dir->i_ino,
>> +			     ext4_dir_htree_level(dir->i_sb));
>> +		if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
>> +			ext4_warning(dir->i_sb, "Enable large directory "
>> +						"feature to access it");
>> +		}
>> 		goto fail;
>> 	}
>> 
>> @@ -859,12 +866,19 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
>> 
>> static void dx_release(struct dx_frame *frames)
>> {
>> +	struct dx_root_info *info;
>> +	int i;
>> +
>> 	if (frames[0].bh == NULL)
>> 		return;
>> 
>> -	if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels)
>> -		brelse(frames[1].bh);
>> -	brelse(frames[0].bh);
>> +	info = &((struct dx_root *)frames[0].bh->b_data)->info;
>> +	for (i = 0; i <= info->indirect_levels; i++) {
>> +		if (frames[i].bh == NULL)
>> +			break;
>> +		brelse(frames[i].bh);
>> +		frames[i].bh = NULL;
>> +	}
>> }
>> 
>> /*
>> @@ -1050,7 +1064,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
>> {
>> 	struct dx_hash_info hinfo;
>> 	struct ext4_dir_entry_2 *de;
>> -	struct dx_frame frames[2], *frame;
>> +	struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
>> 	struct inode *dir;
>> 	ext4_lblk_t block;
>> 	int count = 0;
>> @@ -1517,7 +1531,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
>> 			struct ext4_dir_entry_2 **res_dir)
>> {
>> 	struct super_block * sb = dir->i_sb;
>> -	struct dx_frame frames[2], *frame;
>> +	struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
>> 	const struct qstr *d_name = fname->usr_fname;
>> 	struct buffer_head *bh;
>> 	ext4_lblk_t block;
>> @@ -1947,7 +1961,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
>> 	 */
>> 	dir->i_mtime = dir->i_ctime = current_time(dir);
>> 	ext4_update_dx_flag(dir);
>> -	dir->i_version++;
>> +	inode_inc_iversion(dir);
>> 	ext4_mark_inode_dirty(handle, dir);
>> 	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
>> 	err = ext4_handle_dirty_dirent_node(handle, dir, bh);
>> @@ -1966,7 +1980,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
>> {
>> 	struct buffer_head *bh2;
>> 	struct dx_root	*root;
>> -	struct dx_frame	frames[2], *frame;
>> +	struct dx_frame	frames[EXT4_HTREE_LEVEL], *frame;
>> 	struct dx_entry *entries;
>> 	struct ext4_dir_entry_2	*de, *de2;
>> 	struct ext4_dir_entry_tail *t;
>> @@ -2185,13 +2199,16 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
>> static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>> 			     struct inode *dir, struct inode *inode)
>> {
>> -	struct dx_frame frames[2], *frame;
>> +	struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
>> 	struct dx_entry *entries, *at;
>> 	struct buffer_head *bh;
>> 	struct super_block *sb = dir->i_sb;
>> 	struct ext4_dir_entry_2 *de;
>> +	int restart;
>> 	int err;
>> 
>> +again:
>> +	restart = 0;
>> 	frame = dx_probe(fname, dir, NULL, frames);
>> 	if (IS_ERR(frame))
>> 		return PTR_ERR(frame);
>> @@ -2213,24 +2230,44 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>> 	if (err != -ENOSPC)
>> 		goto cleanup;
>> 
>> +	err = 0;
>> 	/* Block full, should compress but for now just split */
>> 	dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
>> 		       dx_get_count(entries), dx_get_limit(entries)));
>> 	/* Need to split index? */
>> 	if (dx_get_count(entries) == dx_get_limit(entries)) {
>> 		ext4_lblk_t newblock;
>> -		unsigned icount = dx_get_count(entries);
>> -		int levels = frame - frames;
>> +		int levels = frame - frames + 1;
>> +		unsigned int icount;
>> +		int add_level = 1;
>> 		struct dx_entry *entries2;
>> 		struct dx_node *node2;
>> 		struct buffer_head *bh2;
>> 
>> -		if (levels && (dx_get_count(frames->entries) ==
>> -			       dx_get_limit(frames->entries))) {
>> -			ext4_warning_inode(dir, "Directory index full!");
>> +		while (frame > frames) {
>> +			if (dx_get_count((frame - 1)->entries) <
>> +			    dx_get_limit((frame - 1)->entries)) {
>> +				add_level = 0;
>> +				break;
>> +			}
>> +			frame--; /* split higher index block */
>> +			at = frame->at;
>> +			entries = frame->entries;
>> +			restart = 1;
>> +		}
>> +		if (add_level && levels == ext4_dir_htree_level(sb)) {
>> +			ext4_warning(sb, "Directory (ino: %lu) index full, "
>> +					 "reach max htree level :%d",
>> +					 dir->i_ino, levels);
>> +			if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
>> +				ext4_warning(sb, "Large directory feature is "
>> +						 "not enabled on this "
>> +						 "filesystem");
>> +			}
>> 			err = -ENOSPC;
>> 			goto cleanup;
>> 		}
>> +		icount = dx_get_count(entries);
>> 		bh2 = ext4_append(handle, dir, &newblock);
>> 		if (IS_ERR(bh2)) {
>> 			err = PTR_ERR(bh2);
>> @@ -2245,7 +2282,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>> 		err = ext4_journal_get_write_access(handle, frame->bh);
>> 		if (err)
>> 			goto journal_error;
>> -		if (levels) {
>> +		if (!add_level) {
>> 			unsigned icount1 = icount/2, icount2 = icount - icount1;
>> 			unsigned hash2 = dx_get_hash(entries + icount1);
>> 			dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
>> @@ -2253,7 +2290,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>> 
>> 			BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
>> 			err = ext4_journal_get_write_access(handle,
>> -							     frames[0].bh);
>> +							     (frame - 1)->bh);
>> 			if (err)
>> 				goto journal_error;
>> 
>> @@ -2269,17 +2306,23 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>> 				frame->entries = entries = entries2;
>> 				swap(frame->bh, bh2);
>> 			}
>> -			dx_insert_block(frames + 0, hash2, newblock);
>> -			dxtrace(dx_show_index("node", frames[1].entries));
>> +			dx_insert_block((frame - 1), hash2, newblock);
>> +			dxtrace(dx_show_index("node", frame->entries));
>> 			dxtrace(dx_show_index("node",
>> 			       ((struct dx_node *) bh2->b_data)->entries));
>> 			err = ext4_handle_dirty_dx_node(handle, dir, bh2);
>> 			if (err)
>> 				goto journal_error;
>> 			brelse (bh2);
>> +			ext4_handle_dirty_metadata(handle, dir,
>> +						   (frame - 1)->bh);
>> +			if (restart) {
>> +				ext4_handle_dirty_metadata(handle, dir,
>> +							   frame->bh);
>> +				goto cleanup;
>> +			}
>> 		} else {
>> -			dxtrace(printk(KERN_DEBUG
>> -				       "Creating second level index...\n"));
>> +			struct dx_root *dxroot;
>> 			memcpy((char *) entries2, (char *) entries,
>> 			       icount * sizeof(struct dx_entry));
>> 			dx_set_limit(entries2, dx_node_limit(dir));
>> @@ -2287,19 +2330,17 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>> 			/* Set up root */
>> 			dx_set_count(entries, 1);
>> 			dx_set_block(entries + 0, newblock);
>> -			((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
>> -
>> -			/* Add new access path frame */
>> -			frame = frames + 1;
>> -			frame->at = at = at - entries + entries2;
>> -			frame->entries = entries = entries2;
>> -			frame->bh = bh2;
>> -			err = ext4_journal_get_write_access(handle,
>> -							     frame->bh);
>> -			if (err)
>> -				goto journal_error;
>> +			dxroot = (struct dx_root *)frames[0].bh->b_data;
>> +			dxroot->info.indirect_levels += 1;
>> +			dxtrace(printk(KERN_DEBUG
>> +				       "Creating %d level index...\n",
>> +				       info->indirect_levels));
>> +			ext4_handle_dirty_metadata(handle, dir, frame->bh);
>> +			ext4_handle_dirty_metadata(handle, dir, bh2);
>> +			brelse(bh2);
>> +			restart = 1;
>> +			goto cleanup;
>> 		}
>> -		err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
>> 		if (err) {
>> 			ext4_std_error(inode->i_sb, err);
>> 			goto cleanup;
>> @@ -2318,6 +2359,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>> cleanup:
>> 	brelse(bh);
>> 	dx_release(frames);
>> +	/* @restart is true means htree-path has been changed, we need to
>> +	 * repeat dx_probe() to find out valid htree-path
>> +	 */
>> +	if (restart && err == 0)
>> +		goto again;
>> 	return err;
>> }
>> 
>> @@ -2354,7 +2400,7 @@ int ext4_generic_delete_entry(handle_t *handle,
>> 					blocksize);
>> 			else
>> 				de->inode = 0;
>> -			dir->i_version++;
>> +			inode_inc_iversion(dir);
>> 			return 0;
>> 		}
>> 		i += ext4_rec_len_from_disk(de->rec_len, blocksize);
>> --
>> 1.8.3.1
>> 
> 
> 
> Cheers, Andreas
> 
> 
> 
> 
>
Andreas Dilger March 17, 2017, 8:51 p.m. | #3
On Mar 17, 2017, at 12:15 AM, Alexey Lyashkov <alexey.lyashkov@gmail.com> wrote:
> 
> Andreas,
> 
> I not strongly against it patch, but my testing say - 3 level h-tree need a more than 20mil files in directory, and creation rate was dropped dramatically,
> a specially without parallel dir operations, Did we need do some other research changes with landing it patch as disk format will changed anyway by incompatible way ?

Hi Alexey,
I'm not against further improvements to large directory handling (e.g.
directory shrinking, implement pdirops through VFS, etc.) for ext4.
That said, anything that changes the disk format should use a different
feature flag than EXT4_FEATURE_INCOMPAT_LARGEDIR, since we have been
using this flag in the Lustre code for several years already.

I think for very large directories that directory shrinking can be quite
useful, and could actually be implemented without a format change since
htree always masks off the top byte ("fullness ratio") of the logical block
number for this reason, and removing leaf blocks does not change the other
parts of the htree on-disk format.  This could still be done (in a slightly
less efficient manner) even without "fullness ratio" (i.e. if fullness == 0,
unset from older versions of ext4/e2fsck).  This would also help performance
noticeably if a directory has many files deleted, as the htree index will
shrink, and operations like readdir() will not have to skip empty blocks.

IMHO, I think the 3-level htree maximum limits are as far as we need to
go for ext4.  With 4KB blocks this gives us a maximum directory size
of about 5B entries, which is more files than can fit in the filesystem,
Even for 1KB blocks the maximum directory size is about 8M entries, which
is fine since 1KB blocksize is mostly only used for small filesystems, yet
is still as good as what a 4KB filesystem can do today.

Cheers, Andreas

>> 17 марта 2017 г., в 0:44, Andreas Dilger <adilger@dilger.ca> написал(а):
>> 
>> On Mar 16, 2017, at 3:51 AM, Artem Blagodarenko <artem.blagodarenko@gmail.com> wrote:
>>> 
>>> From: Artem Blagodarenko <artem.blagodarenko@gmail.com>
>>> 
>>> This INCOMPAT_LARGEDIR feature allows larger directories to be created
>>> in ldiskfs, both with directory sizes over 2GB and and a maximum htree
>>> depth of 3 instead of the current limit of 2. These features are needed
>>> in order to exceed the current limit of approximately 10M entries in a
>>> single directory.
>> 
>> Artem,
>> many thanks for updating and submitting this, and creating the e2fsck patches.
>> 
>> Ted,
>> can you please add the original author for the largedir ext4 patch when
>> landing this patch: Liang Zhen <liang.zhen@intel.com>
>> 
>> and these tags which contain more background information on this feature:
>> 
>> Lustre-change: https://review.hpdd.intel.com/375
>> Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-50
>> 
>>> Signed-off-by: Yang Sheng <yang.sheng@intel.com>
>>> Signed-off-by: Artem Blagodarenko <artem.blagodarenko@seagate.com>
>> 
>> Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
>> 
>>> ---
>>> fs/ext4/ext4.h  |  23 ++++++++---
>>> fs/ext4/inode.c |   4 +-
>>> fs/ext4/namei.c | 118 +++++++++++++++++++++++++++++++++++++++-----------------
>>> 3 files changed, 102 insertions(+), 43 deletions(-)
>>> 
>>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>>> index 01d52b9..0bbbd9b 100644
>>> --- a/fs/ext4/ext4.h
>>> +++ b/fs/ext4/ext4.h
>>> @@ -1799,7 +1799,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
>>> 					 EXT4_FEATURE_INCOMPAT_MMP | \
>>> 					 EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
>>> 					 EXT4_FEATURE_INCOMPAT_ENCRYPT | \
>>> -					 EXT4_FEATURE_INCOMPAT_CSUM_SEED)
>>> +					 EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
>>> +					 EXT4_FEATURE_INCOMPAT_LARGEDIR)
>>> #define EXT4_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
>>> 					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
>>> 					 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
>>> @@ -2125,6 +2126,16 @@ struct dir_private_info {
>>> */
>>> #define ERR_BAD_DX_DIR	(-(MAX_ERRNO - 1))
>>> 
>>> +/* htree levels for ext4 */
>>> +#define	EXT4_HTREE_LEVEL_COMPAT	2
>>> +#define	EXT4_HTREE_LEVEL	3
>>> +
>>> +static inline int ext4_dir_htree_level(struct super_block *sb)
>>> +{
>>> +	return ext4_has_feature_largedir(sb) ?
>>> +		EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
>>> +}
>>> +
>>> /*
>>> * Timeout and state flag for lazy initialization inode thread.
>>> */
>>> @@ -2758,13 +2769,15 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
>>> 	es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
>>> }
>>> 
>>> -static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
>>> +static inline loff_t ext4_isize(struct super_block *sb,
>>> +				struct ext4_inode *raw_inode)
>>> {
>>> -	if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
>>> +	if (ext4_has_feature_largedir(sb) ||
>>> +	    S_ISREG(le16_to_cpu(raw_inode->i_mode)))
>>> 		return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
>>> 			le32_to_cpu(raw_inode->i_size_lo);
>>> -	else
>>> -		return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
>>> +
>>> +	return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
>>> }
>>> 
>>> static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
>>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>>> index f622d4a..5787f3d 100644
>>> --- a/fs/ext4/inode.c
>>> +++ b/fs/ext4/inode.c
>>> @@ -4682,7 +4682,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
>>> 	if (ext4_has_feature_64bit(sb))
>>> 		ei->i_file_acl |=
>>> 			((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
>>> -	inode->i_size = ext4_isize(raw_inode);
>>> +	inode->i_size = ext4_isize(sb, raw_inode);
>>> 	if ((size = i_size_read(inode)) < 0) {
>>> 		EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
>>> 		ret = -EFSCORRUPTED;
>>> @@ -5008,7 +5008,7 @@ static int ext4_do_update_inode(handle_t *handle,
>>> 		raw_inode->i_file_acl_high =
>>> 			cpu_to_le16(ei->i_file_acl >> 32);
>>> 	raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
>>> -	if (ei->i_disksize != ext4_isize(raw_inode)) {
>>> +	if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
>>> 		ext4_isize_set(raw_inode, ei->i_disksize);
>>> 		need_datasync = 1;
>>> 	}
>>> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
>>> index 6ad612c..3298fe3 100644
>>> --- a/fs/ext4/namei.c
>>> +++ b/fs/ext4/namei.c
>>> @@ -513,7 +513,7 @@ static inline int ext4_handle_dirty_dx_node(handle_t *handle,
>>> 
>>> static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
>>> {
>>> -	return le32_to_cpu(entry->block) & 0x00ffffff;
>>> +	return le32_to_cpu(entry->block) & 0x0fffffff;
>>> }
>> 
>> It wouldn't be terrible to add a comment to this function explaining why
>> the block numbers are masked off, which is to allow the high bits of the
>> logical block number to hold a "fullness" fraction so that the kernel
>> could start shrinking directories if many files are deleted.  That said,
>> this isn't a shortcoming of this patch, but a suggestion if the patch is
>> being resubmitted for some other reason.
>> 
>>> static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
>>> @@ -739,6 +739,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
>>> 	struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
>>> 	u32 hash;
>>> 
>>> +	memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
>>> 	frame->bh = ext4_read_dirblock(dir, 0, INDEX);
>>> 	if (IS_ERR(frame->bh))
>>> 		return (struct dx_frame *) frame->bh;
>>> @@ -768,9 +769,15 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
>>> 	}
>>> 
>>> 	indirect = root->info.indirect_levels;
>>> -	if (indirect > 1) {
>>> -		ext4_warning_inode(dir, "Unimplemented hash depth: %#06x",
>>> -				   root->info.indirect_levels);
>>> +	if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
>>> +		ext4_warning(dir->i_sb,
>>> +			     "Directory (ino: %lu) htree depth %#06x exceed"
>>> +			     "supported value", dir->i_ino,
>>> +			     ext4_dir_htree_level(dir->i_sb));
>>> +		if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
>>> +			ext4_warning(dir->i_sb, "Enable large directory "
>>> +						"feature to access it");
>>> +		}
>>> 		goto fail;
>>> 	}
>>> 
>>> @@ -859,12 +866,19 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
>>> 
>>> static void dx_release(struct dx_frame *frames)
>>> {
>>> +	struct dx_root_info *info;
>>> +	int i;
>>> +
>>> 	if (frames[0].bh == NULL)
>>> 		return;
>>> 
>>> -	if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels)
>>> -		brelse(frames[1].bh);
>>> -	brelse(frames[0].bh);
>>> +	info = &((struct dx_root *)frames[0].bh->b_data)->info;
>>> +	for (i = 0; i <= info->indirect_levels; i++) {
>>> +		if (frames[i].bh == NULL)
>>> +			break;
>>> +		brelse(frames[i].bh);
>>> +		frames[i].bh = NULL;
>>> +	}
>>> }
>>> 
>>> /*
>>> @@ -1050,7 +1064,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
>>> {
>>> 	struct dx_hash_info hinfo;
>>> 	struct ext4_dir_entry_2 *de;
>>> -	struct dx_frame frames[2], *frame;
>>> +	struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
>>> 	struct inode *dir;
>>> 	ext4_lblk_t block;
>>> 	int count = 0;
>>> @@ -1517,7 +1531,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
>>> 			struct ext4_dir_entry_2 **res_dir)
>>> {
>>> 	struct super_block * sb = dir->i_sb;
>>> -	struct dx_frame frames[2], *frame;
>>> +	struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
>>> 	const struct qstr *d_name = fname->usr_fname;
>>> 	struct buffer_head *bh;
>>> 	ext4_lblk_t block;
>>> @@ -1947,7 +1961,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
>>> 	 */
>>> 	dir->i_mtime = dir->i_ctime = current_time(dir);
>>> 	ext4_update_dx_flag(dir);
>>> -	dir->i_version++;
>>> +	inode_inc_iversion(dir);
>>> 	ext4_mark_inode_dirty(handle, dir);
>>> 	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
>>> 	err = ext4_handle_dirty_dirent_node(handle, dir, bh);
>>> @@ -1966,7 +1980,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
>>> {
>>> 	struct buffer_head *bh2;
>>> 	struct dx_root	*root;
>>> -	struct dx_frame	frames[2], *frame;
>>> +	struct dx_frame	frames[EXT4_HTREE_LEVEL], *frame;
>>> 	struct dx_entry *entries;
>>> 	struct ext4_dir_entry_2	*de, *de2;
>>> 	struct ext4_dir_entry_tail *t;
>>> @@ -2185,13 +2199,16 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
>>> static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>>> 			     struct inode *dir, struct inode *inode)
>>> {
>>> -	struct dx_frame frames[2], *frame;
>>> +	struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
>>> 	struct dx_entry *entries, *at;
>>> 	struct buffer_head *bh;
>>> 	struct super_block *sb = dir->i_sb;
>>> 	struct ext4_dir_entry_2 *de;
>>> +	int restart;
>>> 	int err;
>>> 
>>> +again:
>>> +	restart = 0;
>>> 	frame = dx_probe(fname, dir, NULL, frames);
>>> 	if (IS_ERR(frame))
>>> 		return PTR_ERR(frame);
>>> @@ -2213,24 +2230,44 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>>> 	if (err != -ENOSPC)
>>> 		goto cleanup;
>>> 
>>> +	err = 0;
>>> 	/* Block full, should compress but for now just split */
>>> 	dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
>>> 		       dx_get_count(entries), dx_get_limit(entries)));
>>> 	/* Need to split index? */
>>> 	if (dx_get_count(entries) == dx_get_limit(entries)) {
>>> 		ext4_lblk_t newblock;
>>> -		unsigned icount = dx_get_count(entries);
>>> -		int levels = frame - frames;
>>> +		int levels = frame - frames + 1;
>>> +		unsigned int icount;
>>> +		int add_level = 1;
>>> 		struct dx_entry *entries2;
>>> 		struct dx_node *node2;
>>> 		struct buffer_head *bh2;
>>> 
>>> -		if (levels && (dx_get_count(frames->entries) ==
>>> -			       dx_get_limit(frames->entries))) {
>>> -			ext4_warning_inode(dir, "Directory index full!");
>>> +		while (frame > frames) {
>>> +			if (dx_get_count((frame - 1)->entries) <
>>> +			    dx_get_limit((frame - 1)->entries)) {
>>> +				add_level = 0;
>>> +				break;
>>> +			}
>>> +			frame--; /* split higher index block */
>>> +			at = frame->at;
>>> +			entries = frame->entries;
>>> +			restart = 1;
>>> +		}
>>> +		if (add_level && levels == ext4_dir_htree_level(sb)) {
>>> +			ext4_warning(sb, "Directory (ino: %lu) index full, "
>>> +					 "reach max htree level :%d",
>>> +					 dir->i_ino, levels);
>>> +			if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
>>> +				ext4_warning(sb, "Large directory feature is "
>>> +						 "not enabled on this "
>>> +						 "filesystem");
>>> +			}
>>> 			err = -ENOSPC;
>>> 			goto cleanup;
>>> 		}
>>> +		icount = dx_get_count(entries);
>>> 		bh2 = ext4_append(handle, dir, &newblock);
>>> 		if (IS_ERR(bh2)) {
>>> 			err = PTR_ERR(bh2);
>>> @@ -2245,7 +2282,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>>> 		err = ext4_journal_get_write_access(handle, frame->bh);
>>> 		if (err)
>>> 			goto journal_error;
>>> -		if (levels) {
>>> +		if (!add_level) {
>>> 			unsigned icount1 = icount/2, icount2 = icount - icount1;
>>> 			unsigned hash2 = dx_get_hash(entries + icount1);
>>> 			dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
>>> @@ -2253,7 +2290,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>>> 
>>> 			BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
>>> 			err = ext4_journal_get_write_access(handle,
>>> -							     frames[0].bh);
>>> +							     (frame - 1)->bh);
>>> 			if (err)
>>> 				goto journal_error;
>>> 
>>> @@ -2269,17 +2306,23 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>>> 				frame->entries = entries = entries2;
>>> 				swap(frame->bh, bh2);
>>> 			}
>>> -			dx_insert_block(frames + 0, hash2, newblock);
>>> -			dxtrace(dx_show_index("node", frames[1].entries));
>>> +			dx_insert_block((frame - 1), hash2, newblock);
>>> +			dxtrace(dx_show_index("node", frame->entries));
>>> 			dxtrace(dx_show_index("node",
>>> 			       ((struct dx_node *) bh2->b_data)->entries));
>>> 			err = ext4_handle_dirty_dx_node(handle, dir, bh2);
>>> 			if (err)
>>> 				goto journal_error;
>>> 			brelse (bh2);
>>> +			ext4_handle_dirty_metadata(handle, dir,
>>> +						   (frame - 1)->bh);
>>> +			if (restart) {
>>> +				ext4_handle_dirty_metadata(handle, dir,
>>> +							   frame->bh);
>>> +				goto cleanup;
>>> +			}
>>> 		} else {
>>> -			dxtrace(printk(KERN_DEBUG
>>> -				       "Creating second level index...\n"));
>>> +			struct dx_root *dxroot;
>>> 			memcpy((char *) entries2, (char *) entries,
>>> 			       icount * sizeof(struct dx_entry));
>>> 			dx_set_limit(entries2, dx_node_limit(dir));
>>> @@ -2287,19 +2330,17 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>>> 			/* Set up root */
>>> 			dx_set_count(entries, 1);
>>> 			dx_set_block(entries + 0, newblock);
>>> -			((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
>>> -
>>> -			/* Add new access path frame */
>>> -			frame = frames + 1;
>>> -			frame->at = at = at - entries + entries2;
>>> -			frame->entries = entries = entries2;
>>> -			frame->bh = bh2;
>>> -			err = ext4_journal_get_write_access(handle,
>>> -							     frame->bh);
>>> -			if (err)
>>> -				goto journal_error;
>>> +			dxroot = (struct dx_root *)frames[0].bh->b_data;
>>> +			dxroot->info.indirect_levels += 1;
>>> +			dxtrace(printk(KERN_DEBUG
>>> +				       "Creating %d level index...\n",
>>> +				       info->indirect_levels));
>>> +			ext4_handle_dirty_metadata(handle, dir, frame->bh);
>>> +			ext4_handle_dirty_metadata(handle, dir, bh2);
>>> +			brelse(bh2);
>>> +			restart = 1;
>>> +			goto cleanup;
>>> 		}
>>> -		err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
>>> 		if (err) {
>>> 			ext4_std_error(inode->i_sb, err);
>>> 			goto cleanup;
>>> @@ -2318,6 +2359,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>>> cleanup:
>>> 	brelse(bh);
>>> 	dx_release(frames);
>>> +	/* @restart is true means htree-path has been changed, we need to
>>> +	 * repeat dx_probe() to find out valid htree-path
>>> +	 */
>>> +	if (restart && err == 0)
>>> +		goto again;
>>> 	return err;
>>> }
>>> 
>>> @@ -2354,7 +2400,7 @@ int ext4_generic_delete_entry(handle_t *handle,
>>> 					blocksize);
>>> 			else
>>> 				de->inode = 0;
>>> -			dir->i_version++;
>>> +			inode_inc_iversion(dir);
>>> 			return 0;
>>> 		}
>>> 		i += ext4_rec_len_from_disk(de->rec_len, blocksize);
>>> --
>>> 1.8.3.1
>>> 
>> 
>> 
>> Cheers, Andreas
>> 
>> 
>> 
>> 
>> 
> 


Cheers, Andreas
Alexey Lyahkov March 18, 2017, 8:16 a.m. | #4
Andreas,

it not about a feature flag. It’s about a situation in whole.
Yes, we may increase a directory size, but it open a number a large problems.
1) readdir. It tries to read all entries in memory before send to the user. currently it may eats 20*10^6 * 256 so several gigs, so increasing it size may produce a problems for a system.
2) inode allocation. Current code tries to allocate an inode as near as possible to the directory inode, but one GD may hold 32k entries only, so increase a directory size will use up 1k GD for now and more than it, after it. It increase a seek time with file allocation. It was i mean when say - «dramatically decrease a file creation rate».
3) current limit with 4G inodes - currently 32-128 directories may eats a full inode number space. From it perspective large dir don’t need to be used.

Other improvements you point already. 

> 17 марта 2017 г., в 23:51, Andreas Dilger <adilger@dilger.ca> написал(а):
> 
> On Mar 17, 2017, at 12:15 AM, Alexey Lyashkov <alexey.lyashkov@gmail.com> wrote:
>> 
>> Andreas,
>> 
>> I not strongly against it patch, but my testing say - 3 level h-tree need a more than 20mil files in directory, and creation rate was dropped dramatically,
>> a specially without parallel dir operations, Did we need do some other research changes with landing it patch as disk format will changed anyway by incompatible way ?
> 
> Hi Alexey,
> I'm not against further improvements to large directory handling (e.g.
> directory shrinking, implement pdirops through VFS, etc.) for ext4.
> That said, anything that changes the disk format should use a different
> feature flag than EXT4_FEATURE_INCOMPAT_LARGEDIR, since we have been
> using this flag in the Lustre code for several years already.
> 
> I think for very large directories that directory shrinking can be quite
> useful, and could actually be implemented without a format change since
> htree always masks off the top byte ("fullness ratio") of the logical block
> number for this reason, and removing leaf blocks does not change the other
> parts of the htree on-disk format.  This could still be done (in a slightly
> less efficient manner) even without "fullness ratio" (i.e. if fullness == 0,
> unset from older versions of ext4/e2fsck).  This would also help performance
> noticeably if a directory has many files deleted, as the htree index will
> shrink, and operations like readdir() will not have to skip empty blocks.
> 
> IMHO, I think the 3-level htree maximum limits are as far as we need to
> go for ext4.  With 4KB blocks this gives us a maximum directory size
> of about 5B entries, which is more files than can fit in the filesystem,
> Even for 1KB blocks the maximum directory size is about 8M entries, which
> is fine since 1KB blocksize is mostly only used for small filesystems, yet
> is still as good as what a 4KB filesystem can do today.
> 
> Cheers, Andreas
> 
>>> 17 марта 2017 г., в 0:44, Andreas Dilger <adilger@dilger.ca> написал(а):
>>> 
>>> On Mar 16, 2017, at 3:51 AM, Artem Blagodarenko <artem.blagodarenko@gmail.com> wrote:
>>>> 
>>>> From: Artem Blagodarenko <artem.blagodarenko@gmail.com>
>>>> 
>>>> This INCOMPAT_LARGEDIR feature allows larger directories to be created
>>>> in ldiskfs, both with directory sizes over 2GB and and a maximum htree
>>>> depth of 3 instead of the current limit of 2. These features are needed
>>>> in order to exceed the current limit of approximately 10M entries in a
>>>> single directory.
>>> 
>>> Artem,
>>> many thanks for updating and submitting this, and creating the e2fsck patches.
>>> 
>>> Ted,
>>> can you please add the original author for the largedir ext4 patch when
>>> landing this patch: Liang Zhen <liang.zhen@intel.com>
>>> 
>>> and these tags which contain more background information on this feature:
>>> 
>>> Lustre-change: https://review.hpdd.intel.com/375
>>> Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-50
>>> 
>>>> Signed-off-by: Yang Sheng <yang.sheng@intel.com>
>>>> Signed-off-by: Artem Blagodarenko <artem.blagodarenko@seagate.com>
>>> 
>>> Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
>>> 
>>>> ---
>>>> fs/ext4/ext4.h  |  23 ++++++++---
>>>> fs/ext4/inode.c |   4 +-
>>>> fs/ext4/namei.c | 118 +++++++++++++++++++++++++++++++++++++++-----------------
>>>> 3 files changed, 102 insertions(+), 43 deletions(-)
>>>> 
>>>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>>>> index 01d52b9..0bbbd9b 100644
>>>> --- a/fs/ext4/ext4.h
>>>> +++ b/fs/ext4/ext4.h
>>>> @@ -1799,7 +1799,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
>>>> 					 EXT4_FEATURE_INCOMPAT_MMP | \
>>>> 					 EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
>>>> 					 EXT4_FEATURE_INCOMPAT_ENCRYPT | \
>>>> -					 EXT4_FEATURE_INCOMPAT_CSUM_SEED)
>>>> +					 EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
>>>> +					 EXT4_FEATURE_INCOMPAT_LARGEDIR)
>>>> #define EXT4_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
>>>> 					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
>>>> 					 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
>>>> @@ -2125,6 +2126,16 @@ struct dir_private_info {
>>>> */
>>>> #define ERR_BAD_DX_DIR	(-(MAX_ERRNO - 1))
>>>> 
>>>> +/* htree levels for ext4 */
>>>> +#define	EXT4_HTREE_LEVEL_COMPAT	2
>>>> +#define	EXT4_HTREE_LEVEL	3
>>>> +
>>>> +static inline int ext4_dir_htree_level(struct super_block *sb)
>>>> +{
>>>> +	return ext4_has_feature_largedir(sb) ?
>>>> +		EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
>>>> +}
>>>> +
>>>> /*
>>>> * Timeout and state flag for lazy initialization inode thread.
>>>> */
>>>> @@ -2758,13 +2769,15 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
>>>> 	es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
>>>> }
>>>> 
>>>> -static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
>>>> +static inline loff_t ext4_isize(struct super_block *sb,
>>>> +				struct ext4_inode *raw_inode)
>>>> {
>>>> -	if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
>>>> +	if (ext4_has_feature_largedir(sb) ||
>>>> +	    S_ISREG(le16_to_cpu(raw_inode->i_mode)))
>>>> 		return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
>>>> 			le32_to_cpu(raw_inode->i_size_lo);
>>>> -	else
>>>> -		return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
>>>> +
>>>> +	return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
>>>> }
>>>> 
>>>> static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
>>>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>>>> index f622d4a..5787f3d 100644
>>>> --- a/fs/ext4/inode.c
>>>> +++ b/fs/ext4/inode.c
>>>> @@ -4682,7 +4682,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
>>>> 	if (ext4_has_feature_64bit(sb))
>>>> 		ei->i_file_acl |=
>>>> 			((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
>>>> -	inode->i_size = ext4_isize(raw_inode);
>>>> +	inode->i_size = ext4_isize(sb, raw_inode);
>>>> 	if ((size = i_size_read(inode)) < 0) {
>>>> 		EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
>>>> 		ret = -EFSCORRUPTED;
>>>> @@ -5008,7 +5008,7 @@ static int ext4_do_update_inode(handle_t *handle,
>>>> 		raw_inode->i_file_acl_high =
>>>> 			cpu_to_le16(ei->i_file_acl >> 32);
>>>> 	raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
>>>> -	if (ei->i_disksize != ext4_isize(raw_inode)) {
>>>> +	if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
>>>> 		ext4_isize_set(raw_inode, ei->i_disksize);
>>>> 		need_datasync = 1;
>>>> 	}
>>>> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
>>>> index 6ad612c..3298fe3 100644
>>>> --- a/fs/ext4/namei.c
>>>> +++ b/fs/ext4/namei.c
>>>> @@ -513,7 +513,7 @@ static inline int ext4_handle_dirty_dx_node(handle_t *handle,
>>>> 
>>>> static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
>>>> {
>>>> -	return le32_to_cpu(entry->block) & 0x00ffffff;
>>>> +	return le32_to_cpu(entry->block) & 0x0fffffff;
>>>> }
>>> 
>>> It wouldn't be terrible to add a comment to this function explaining why
>>> the block numbers are masked off, which is to allow the high bits of the
>>> logical block number to hold a "fullness" fraction so that the kernel
>>> could start shrinking directories if many files are deleted.  That said,
>>> this isn't a shortcoming of this patch, but a suggestion if the patch is
>>> being resubmitted for some other reason.
>>> 
>>>> static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
>>>> @@ -739,6 +739,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
>>>> 	struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
>>>> 	u32 hash;
>>>> 
>>>> +	memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
>>>> 	frame->bh = ext4_read_dirblock(dir, 0, INDEX);
>>>> 	if (IS_ERR(frame->bh))
>>>> 		return (struct dx_frame *) frame->bh;
>>>> @@ -768,9 +769,15 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
>>>> 	}
>>>> 
>>>> 	indirect = root->info.indirect_levels;
>>>> -	if (indirect > 1) {
>>>> -		ext4_warning_inode(dir, "Unimplemented hash depth: %#06x",
>>>> -				   root->info.indirect_levels);
>>>> +	if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
>>>> +		ext4_warning(dir->i_sb,
>>>> +			     "Directory (ino: %lu) htree depth %#06x exceed"
>>>> +			     "supported value", dir->i_ino,
>>>> +			     ext4_dir_htree_level(dir->i_sb));
>>>> +		if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
>>>> +			ext4_warning(dir->i_sb, "Enable large directory "
>>>> +						"feature to access it");
>>>> +		}
>>>> 		goto fail;
>>>> 	}
>>>> 
>>>> @@ -859,12 +866,19 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
>>>> 
>>>> static void dx_release(struct dx_frame *frames)
>>>> {
>>>> +	struct dx_root_info *info;
>>>> +	int i;
>>>> +
>>>> 	if (frames[0].bh == NULL)
>>>> 		return;
>>>> 
>>>> -	if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels)
>>>> -		brelse(frames[1].bh);
>>>> -	brelse(frames[0].bh);
>>>> +	info = &((struct dx_root *)frames[0].bh->b_data)->info;
>>>> +	for (i = 0; i <= info->indirect_levels; i++) {
>>>> +		if (frames[i].bh == NULL)
>>>> +			break;
>>>> +		brelse(frames[i].bh);
>>>> +		frames[i].bh = NULL;
>>>> +	}
>>>> }
>>>> 
>>>> /*
>>>> @@ -1050,7 +1064,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
>>>> {
>>>> 	struct dx_hash_info hinfo;
>>>> 	struct ext4_dir_entry_2 *de;
>>>> -	struct dx_frame frames[2], *frame;
>>>> +	struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
>>>> 	struct inode *dir;
>>>> 	ext4_lblk_t block;
>>>> 	int count = 0;
>>>> @@ -1517,7 +1531,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
>>>> 			struct ext4_dir_entry_2 **res_dir)
>>>> {
>>>> 	struct super_block * sb = dir->i_sb;
>>>> -	struct dx_frame frames[2], *frame;
>>>> +	struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
>>>> 	const struct qstr *d_name = fname->usr_fname;
>>>> 	struct buffer_head *bh;
>>>> 	ext4_lblk_t block;
>>>> @@ -1947,7 +1961,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
>>>> 	 */
>>>> 	dir->i_mtime = dir->i_ctime = current_time(dir);
>>>> 	ext4_update_dx_flag(dir);
>>>> -	dir->i_version++;
>>>> +	inode_inc_iversion(dir);
>>>> 	ext4_mark_inode_dirty(handle, dir);
>>>> 	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
>>>> 	err = ext4_handle_dirty_dirent_node(handle, dir, bh);
>>>> @@ -1966,7 +1980,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
>>>> {
>>>> 	struct buffer_head *bh2;
>>>> 	struct dx_root	*root;
>>>> -	struct dx_frame	frames[2], *frame;
>>>> +	struct dx_frame	frames[EXT4_HTREE_LEVEL], *frame;
>>>> 	struct dx_entry *entries;
>>>> 	struct ext4_dir_entry_2	*de, *de2;
>>>> 	struct ext4_dir_entry_tail *t;
>>>> @@ -2185,13 +2199,16 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
>>>> static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>>>> 			     struct inode *dir, struct inode *inode)
>>>> {
>>>> -	struct dx_frame frames[2], *frame;
>>>> +	struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
>>>> 	struct dx_entry *entries, *at;
>>>> 	struct buffer_head *bh;
>>>> 	struct super_block *sb = dir->i_sb;
>>>> 	struct ext4_dir_entry_2 *de;
>>>> +	int restart;
>>>> 	int err;
>>>> 
>>>> +again:
>>>> +	restart = 0;
>>>> 	frame = dx_probe(fname, dir, NULL, frames);
>>>> 	if (IS_ERR(frame))
>>>> 		return PTR_ERR(frame);
>>>> @@ -2213,24 +2230,44 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>>>> 	if (err != -ENOSPC)
>>>> 		goto cleanup;
>>>> 
>>>> +	err = 0;
>>>> 	/* Block full, should compress but for now just split */
>>>> 	dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
>>>> 		       dx_get_count(entries), dx_get_limit(entries)));
>>>> 	/* Need to split index? */
>>>> 	if (dx_get_count(entries) == dx_get_limit(entries)) {
>>>> 		ext4_lblk_t newblock;
>>>> -		unsigned icount = dx_get_count(entries);
>>>> -		int levels = frame - frames;
>>>> +		int levels = frame - frames + 1;
>>>> +		unsigned int icount;
>>>> +		int add_level = 1;
>>>> 		struct dx_entry *entries2;
>>>> 		struct dx_node *node2;
>>>> 		struct buffer_head *bh2;
>>>> 
>>>> -		if (levels && (dx_get_count(frames->entries) ==
>>>> -			       dx_get_limit(frames->entries))) {
>>>> -			ext4_warning_inode(dir, "Directory index full!");
>>>> +		while (frame > frames) {
>>>> +			if (dx_get_count((frame - 1)->entries) <
>>>> +			    dx_get_limit((frame - 1)->entries)) {
>>>> +				add_level = 0;
>>>> +				break;
>>>> +			}
>>>> +			frame--; /* split higher index block */
>>>> +			at = frame->at;
>>>> +			entries = frame->entries;
>>>> +			restart = 1;
>>>> +		}
>>>> +		if (add_level && levels == ext4_dir_htree_level(sb)) {
>>>> +			ext4_warning(sb, "Directory (ino: %lu) index full, "
>>>> +					 "reach max htree level :%d",
>>>> +					 dir->i_ino, levels);
>>>> +			if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
>>>> +				ext4_warning(sb, "Large directory feature is "
>>>> +						 "not enabled on this "
>>>> +						 "filesystem");
>>>> +			}
>>>> 			err = -ENOSPC;
>>>> 			goto cleanup;
>>>> 		}
>>>> +		icount = dx_get_count(entries);
>>>> 		bh2 = ext4_append(handle, dir, &newblock);
>>>> 		if (IS_ERR(bh2)) {
>>>> 			err = PTR_ERR(bh2);
>>>> @@ -2245,7 +2282,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>>>> 		err = ext4_journal_get_write_access(handle, frame->bh);
>>>> 		if (err)
>>>> 			goto journal_error;
>>>> -		if (levels) {
>>>> +		if (!add_level) {
>>>> 			unsigned icount1 = icount/2, icount2 = icount - icount1;
>>>> 			unsigned hash2 = dx_get_hash(entries + icount1);
>>>> 			dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
>>>> @@ -2253,7 +2290,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>>>> 
>>>> 			BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
>>>> 			err = ext4_journal_get_write_access(handle,
>>>> -							     frames[0].bh);
>>>> +							     (frame - 1)->bh);
>>>> 			if (err)
>>>> 				goto journal_error;
>>>> 
>>>> @@ -2269,17 +2306,23 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>>>> 				frame->entries = entries = entries2;
>>>> 				swap(frame->bh, bh2);
>>>> 			}
>>>> -			dx_insert_block(frames + 0, hash2, newblock);
>>>> -			dxtrace(dx_show_index("node", frames[1].entries));
>>>> +			dx_insert_block((frame - 1), hash2, newblock);
>>>> +			dxtrace(dx_show_index("node", frame->entries));
>>>> 			dxtrace(dx_show_index("node",
>>>> 			       ((struct dx_node *) bh2->b_data)->entries));
>>>> 			err = ext4_handle_dirty_dx_node(handle, dir, bh2);
>>>> 			if (err)
>>>> 				goto journal_error;
>>>> 			brelse (bh2);
>>>> +			ext4_handle_dirty_metadata(handle, dir,
>>>> +						   (frame - 1)->bh);
>>>> +			if (restart) {
>>>> +				ext4_handle_dirty_metadata(handle, dir,
>>>> +							   frame->bh);
>>>> +				goto cleanup;
>>>> +			}
>>>> 		} else {
>>>> -			dxtrace(printk(KERN_DEBUG
>>>> -				       "Creating second level index...\n"));
>>>> +			struct dx_root *dxroot;
>>>> 			memcpy((char *) entries2, (char *) entries,
>>>> 			       icount * sizeof(struct dx_entry));
>>>> 			dx_set_limit(entries2, dx_node_limit(dir));
>>>> @@ -2287,19 +2330,17 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>>>> 			/* Set up root */
>>>> 			dx_set_count(entries, 1);
>>>> 			dx_set_block(entries + 0, newblock);
>>>> -			((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
>>>> -
>>>> -			/* Add new access path frame */
>>>> -			frame = frames + 1;
>>>> -			frame->at = at = at - entries + entries2;
>>>> -			frame->entries = entries = entries2;
>>>> -			frame->bh = bh2;
>>>> -			err = ext4_journal_get_write_access(handle,
>>>> -							     frame->bh);
>>>> -			if (err)
>>>> -				goto journal_error;
>>>> +			dxroot = (struct dx_root *)frames[0].bh->b_data;
>>>> +			dxroot->info.indirect_levels += 1;
>>>> +			dxtrace(printk(KERN_DEBUG
>>>> +				       "Creating %d level index...\n",
>>>> +				       info->indirect_levels));
>>>> +			ext4_handle_dirty_metadata(handle, dir, frame->bh);
>>>> +			ext4_handle_dirty_metadata(handle, dir, bh2);
>>>> +			brelse(bh2);
>>>> +			restart = 1;
>>>> +			goto cleanup;
>>>> 		}
>>>> -		err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
>>>> 		if (err) {
>>>> 			ext4_std_error(inode->i_sb, err);
>>>> 			goto cleanup;
>>>> @@ -2318,6 +2359,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
>>>> cleanup:
>>>> 	brelse(bh);
>>>> 	dx_release(frames);
>>>> +	/* @restart is true means htree-path has been changed, we need to
>>>> +	 * repeat dx_probe() to find out valid htree-path
>>>> +	 */
>>>> +	if (restart && err == 0)
>>>> +		goto again;
>>>> 	return err;
>>>> }
>>>> 
>>>> @@ -2354,7 +2400,7 @@ int ext4_generic_delete_entry(handle_t *handle,
>>>> 					blocksize);
>>>> 			else
>>>> 				de->inode = 0;
>>>> -			dir->i_version++;
>>>> +			inode_inc_iversion(dir);
>>>> 			return 0;
>>>> 		}
>>>> 		i += ext4_rec_len_from_disk(de->rec_len, blocksize);
>>>> --
>>>> 1.8.3.1
>>>> 
>>> 
>>> 
>>> Cheers, Andreas
>>> 
>>> 
>>> 
>>> 
>>> 
>> 
> 
> 
> Cheers, Andreas
> 
> 
> 
> 
>
Theodore Ts'o March 18, 2017, 4:29 p.m. | #5
On Sat, Mar 18, 2017 at 11:16:26AM +0300, Alexey Lyashkov wrote:
> Andreas,
> 
> it not about a feature flag. It’s about a situation in whole.
> Yes, we may increase a directory size, but it open a number a large problems.

> 1) readdir. It tries to read all entries in memory before send to
> the user. currently it may eats 20*10^6 * 256 so several gigs, so
> increasing it size may produce a problems for a system.

That's not true.  We normally only read in one block a time.  If there
is a hash collision, then we may need to insert into the rbtree in a
subsequent block's worth of dentries to make sure we have all of the
directory entries corresponding to a particular hash value.  I think
you misunderstood the code.

> 2) inode allocation. Current code tries to allocate an inode as near as possible to the directory inode, but one GD may hold 32k entries only, so increase a directory size will use up 1k GD for now and more than it, after it. It increase a seek time with file allocation. It was i mean when say - «dramatically decrease a file creation rate».

But there are also only 32k blocks in a group descriptor, and we try
to keep the blocks allocated close to the inode.  So if you are using
a huge directory, and you are using a storage device with a
significant seek penalty, then yes, no matter what as the directory
grows, the time to iterate over all of the files does grow.  But there
is more to life than microbenchmarks which creates huge numbers of zero
length files!  If we assume that the files are going to need to
contain _data_, and the data blocks should be close to the inodes,
then there are going to be some performance impacts no matter what.

> 3) current limit with 4G inodes - currently 32-128 directories may eats a full inode number space. From it perspective large dir don’t need to be used.

I can imagine a new feature flag which defines the use a 64-bit inode
number, but that's more for people who are creating a file system that
takes advantage of 64-bit block numbers, and they are intending on
using all of that space to store small (< 4k or < 8k) files.

And it's also true that there are huge advantges to using a
multi-level directory hierarchy --- e.g.:

.git/objects/03/08e42105258d4e53ffeb81ffb2a4b2480bb8b8

or even

.git/objects/03/08/e42105258d4e53ffeb81ffb2a4b2480bb8b8

instead of:

.git/objects/0308e42105258d4e53ffeb81ffb2a4b2480bb8b8

but that's an application level question.  If for some reason some
silly application programmer wants to have a single gargantuan
directory, if the patches to support it are fairly simple, even if
someone is going to give us patches to do something more general,
burning an extra feature flag doesn't seem like the most terrible
thing in the world.

As for the other optimizations --- things like allowing parallel
directory modifications, or being able to shrink empty directory
blocks or shorten the tree are all improvements we can make without
impacting the on-disk format.  So they aren't an argument for halting
the submission of the new on-disk format, no?

    	       	      	  	  	  - Ted
Alexey Lyahkov March 18, 2017, 5:17 p.m. | #6
> 18 марта 2017 г., в 19:29, Theodore Ts'o <tytso@mit.edu> написал(а):
> 
> On Sat, Mar 18, 2017 at 11:16:26AM +0300, Alexey Lyashkov wrote:
>> Andreas,
>> 
>> it not about a feature flag. It’s about a situation in whole.
>> Yes, we may increase a directory size, but it open a number a large problems.
> 
>> 1) readdir. It tries to read all entries in memory before send to
>> the user. currently it may eats 20*10^6 * 256 so several gigs, so
>> increasing it size may produce a problems for a system.
> 
> That's not true.  We normally only read in one block a time.  If there
> is a hash collision, then we may need to insert into the rbtree in a
> subsequent block's worth of dentries to make sure we have all of the
> directory entries corresponding to a particular hash value.  I think
> you misunderstood the code.
As i see it not about hash collisions, but about merging a several blocks into same hash range on up level hash entry.
so if we have a large hash range originally assigned to the single block, all that range will read at memory at single step.
With «aged» directory when hash blocks used already - it’s easy to hit.


> 
>> 2) inode allocation. Current code tries to allocate an inode as near as possible to the directory inode, but one GD may hold 32k entries only, so increase a directory size will use up 1k GD for now and more than it, after it. It increase a seek time with file allocation. It was i mean when say - «dramatically decrease a file creation rate».
> 
> But there are also only 32k blocks in a group descriptor, and we try
> to keep the blocks allocated close to the inode.
with bigalloc feature it’s not a 32k blocks. but 32k clusters with 1M cluster size(as example), it very large space.


>  So if you are using
> a huge directory, and you are using a storage device with a
> significant seek penalty, then yes, no matter what as the directory
> grows, the time to iterate over all of the files does grow.  But there
> is more to life than microbenchmarks which creates huge numbers of zero
> length files!  If we assume that the files are going to need to
> contain _data_, and the data blocks should be close to the inodes,
> then there are going to be some performance impacts no matter what.
> 
Yes, i expect to have some seek penalty. But may testing say it too huge now.
directory creation rate started with 80k create/s have dropped to the 20k-30k create/s with hash tree extend to the level 3.
Same testing with hard links same create rate dropped slightly.


>> 3) current limit with 4G inodes - currently 32-128 directories may eats a full inode number space. From it perspective large dir don’t need to be used.
> 
> I can imagine a new feature flag which defines the use a 64-bit inode
> number, but that's more for people who are creating a file system that
> takes advantage of 64-bit block numbers, and they are intending on
> using all of that space to store small (< 4k or < 8k) files.
> 
> And it's also true that there are huge advantges to using a
> multi-level directory hierarchy --- e.g.:
> 
> .git/objects/03/08e42105258d4e53ffeb81ffb2a4b2480bb8b8
> 
> or even
> 
> .git/objects/03/08/e42105258d4e53ffeb81ffb2a4b2480bb8b8
> 
> instead of:
> 
> .git/objects/0308e42105258d4e53ffeb81ffb2a4b2480bb8b8
> 
> but that's an application level question.  If for some reason some
> silly application programmer wants to have a single gargantuan
> directory, if the patches to support it are fairly simple, even if
> someone is going to give us patches to do something more general,
> burning an extra feature flag doesn't seem like the most terrible
> thing in the world.
From other side - application don’t expect to have very slow directory and have access with some constant or near or it speed.



> 
> As for the other optimizations --- things like allowing parallel
> directory modifications, or being able to shrink empty directory
> blocks or shorten the tree are all improvements we can make without
> impacting the on-disk format.  So they aren't an argument for halting
> the submission of the new on-disk format, no?
> 
It’s argument about using this feature. Yes, we can land it, but it decrease an expected speed in some cases.


Alexey
Theodore Ts'o March 19, 2017, 12:39 a.m. | #7
On Sat, Mar 18, 2017 at 08:17:55PM +0300, Alexey Lyashkov wrote:
> > 
> > That's not true.  We normally only read in one block a time.  If there
> > is a hash collision, then we may need to insert into the rbtree in a
> > subsequent block's worth of dentries to make sure we have all of the
> > directory entries corresponding to a particular hash value.  I think
> > you misunderstood the code.
>
> As i see it not about hash collisions, but about merging a several
> blocks into same hash range on up level hash entry.  so if we have a
> large hash range originally assigned to the single block, all that
> range will read at memory at single step.  With «aged» directory
> when hash blocks used already - it’s easy to hit.

If you look at ext4_htree_fill_tree(), we are only iterating over the
leaf blocks.  We are using a 31-bit hash, where the low-order bit is
one if there has been a collision.  In that case, we need to read the
next block to make sure all of the directory entries which have the
same 31-bit hash are in the rbtree.

So *even* if the directory is so badly fragmeneted that there is only
one directory entry per block, we will eventually find a block where
the last (only) directory entry has a different directory entry from
the current hash value, at which point we can stop.

And I'll note that this is a problem that can be solved without
changing the on-disk format, but simply adding code so we can merge
adjacent directory entry blocks.  The main reason why it hasn't been
done is (a) no one has sent me patches, and (b) I haven't seen
workloads where this has been anything other than a theoretical /
academic concern.

You seem very passionate about this.  Is this a problem you've
personally seen?  If so, can you give me more details about your use
case, and how you've been running into this issue?  Instead of just
arguing about it from a largely theoretical perspective?

> Yes, i expect to have some seek penalty. But may testing say it too huge now.
> directory creation rate started with 80k create/s have dropped to the 20k-30k create/s with hash tree extend to the level 3.
> Same testing with hard links same create rate dropped slightly.

So this sounds like it's all about the seek penalty of the _data_
blocks.  If you use hard links the creation rate only dropped a
little, am I understanding you corretly?  (Sorry, your English is a
little fracturered so I'm having trouble parsing the meaning out of
your sentences.)

So what do you think the creation rate _should_ be?  And where do you
think the time is going to, if it's not the fact that we have to place
the data blocks further and further from the directory?  And more
importantly, what's your proposal for how to "fix" this?

> > As for the other optimizations --- things like allowing parallel
> > directory modifications, or being able to shrink empty directory
> > blocks or shorten the tree are all improvements we can make without
> > impacting the on-disk format.  So they aren't an argument for halting
> > the submission of the new on-disk format, no?
> > 
> It’s argument about using this feature. Yes, we can land it, but it decrease an expected speed in some cases.

But there are cases where today, the workload would simply fail with
ENOSPC when the directory couldn't grow any farther.  So in those
cases _maybe_ there is something we could do differently that might
make things faster, but you have yet to convince me that the
fundamental fault is one that can only be cured by an on-disk format
change.  (And if you believe this is to be true, please enlighten us
on how we can make the on-disk format better!)

Cheers,

					- Ted
Alexey Lyahkov March 19, 2017, 4:19 a.m. | #8
sorry for english..
> 19 марта 2017 г., в 3:39, Theodore Ts'o <tytso@mit.edu> написал(а):
> 
> On Sat, Mar 18, 2017 at 08:17:55PM +0300, Alexey Lyashkov wrote:
>>> 
>>> That's not true.  We normally only read in one block a time.  If there
>>> is a hash collision, then we may need to insert into the rbtree in a
>>> subsequent block's worth of dentries to make sure we have all of the
>>> directory entries corresponding to a particular hash value.  I think
>>> you misunderstood the code.
>> 
>> As i see it not about hash collisions, but about merging a several
>> blocks into same hash range on up level hash entry.  so if we have a
>> large hash range originally assigned to the single block, all that
>> range will read at memory at single step.  With «aged» directory
>> when hash blocks used already - it’s easy to hit.
> 
> If you look at ext4_htree_fill_tree(), we are only iterating over the
> leaf blocks.  We are using a 31-bit hash, where the low-order bit is
> one if there has been a collision.  In that case, we need to read the
> next block to make sure all of the directory entries which have the
> same 31-bit hash are in the rbtree.
looks we say about same but with different words.
based on dx code, up level hash block have a records - {hash1, block1} {hash2, block1}…
so any records with hash range {hash1, hash2} will live on block1.
right ? 
so question how much {hash1, hash2} distance may be, looks you name it as "hash collision".


> You seem very passionate about this.  Is this a problem you've
> personally seen?  If so, can you give me more details about your use
> case, and how you've been running into this issue?  Instead of just
> arguing about it from a largely theoretical perspective?
> 
It problem was seen with Lustre MDT code  after large number create/unlinks.
but it seen only few times.
Other hits isn’t conformed.



>> Yes, i expect to have some seek penalty. But may testing say it too huge now.
>> directory creation rate started with 80k create/s have dropped to the 20k-30k create/s with hash tree extend to the level 3.
>> Same testing with hard links same create rate dropped slightly.
> 
> So this sounds like it's all about the seek penalty of the _data_
> blocks.  
no.

> If you use hard links the creation rate only dropped a
> little, am I understanding you corretly?
yes and no. hard link create rate dropped a little, but open()+close tests dropped a large.
No writes, no data blocks, just inode allocation.



>  (Sorry, your English is a
> little fracturered so I'm having trouble parsing the meaning out of
> your sentences.)
it’s my bad :(
> 
> So what do you think the creation rate _should_ be?  And where do you
> think the time is going to, if it's not the fact that we have to place
> the data blocks further and further from the directory?  And more
> importantly, what's your proposal for how to "fix" this?
> 
>>> As for the other optimizations --- things like allowing parallel
>>> directory modifications, or being able to shrink empty directory
>>> blocks or shorten the tree are all improvements we can make without
>>> impacting the on-disk format.  So they aren't an argument for halting
>>> the submission of the new on-disk format, no?
>>> 
>> It’s argument about using this feature. Yes, we can land it, but it decrease an expected speed in some cases.
> 
> But there are cases where today, the workload would simply fail with
> ENOSPC when the directory couldn't grow any farther.  So in those
> cases _maybe_ there is something we could do differently that might
> make things faster, but you have yet to convince me that the
> fundamental fault is one that can only be cured by an on-disk format
> change.  (And if you believe this is to be true, please enlighten us
> on how we can make the on-disk format better!)
> 
> Cheers,
> 
> 					- Ted
Andreas Dilger March 19, 2017, 5:38 a.m. | #9
On Mar 18, 2017, at 10:29 AM, Theodore Ts'o <tytso@mit.edu> wrote:
> 
> On Sat, Mar 18, 2017 at 11:16:26AM +0300, Alexey Lyashkov wrote:
>> Andreas,
>> 
>> it not about a feature flag. It’s about a situation in whole.
>> Yes, we may increase a directory size, but it open a number a large problems.
> 
>> 1) readdir. It tries to read all entries in memory before send to
>> the user. currently it may eats 20*10^6 * 256 so several gigs, so
>> increasing it size may produce a problems for a system.
> 
> That's not true.  We normally only read in one block a time.  If there
> is a hash collision, then we may need to insert into the rbtree in a
> subsequent block's worth of dentries to make sure we have all of the
> directory entries corresponding to a particular hash value.  I think
> you misunderstood the code.
> 
>> 2) inode allocation. Current code tries to allocate an inode as near as possible to the directory inode, but one GD may hold 32k entries only, so increase a directory size will use up 1k GD for now and more than it, after it. It increase a seek time with file allocation. It was i mean when say - «dramatically decrease a file creation rate».
> 
> But there are also only 32k blocks in a group descriptor, and we try
> to keep the blocks allocated close to the inode.  So if you are using
> a huge directory, and you are using a storage device with a
> significant seek penalty, then yes, no matter what as the directory
> grows, the time to iterate over all of the files does grow.  But there
> is more to life than microbenchmarks which creates huge numbers of zero
> length files!  If we assume that the files are going to need to
> contain _data_, and the data blocks should be close to the inodes,
> then there are going to be some performance impacts no matter what.

Actually, on a Lustre MDT there _are_ only zero-length files, since all
of the data is stored in another filesystem.  Fortunately, the parent
directory stores the last group successfully used for allocation
(i_alloc_group) so that new inode allocation doesn't have to scan the
whole filesystem each time from the parent's group.

>> 3) current limit with 4G inodes - currently 32-128 directories may eats a full inode number space. From it perspective large dir don’t need to be used.
> 
> I can imagine a new feature flag which defines the use a 64-bit inode
> number, but that's more for people who are creating a file system that
> takes advantage of 64-bit block numbers, and they are intending on
> using all of that space to store small (< 4k or < 8k) files.

The 4-billion inode limit is somewhat independent of large directories.
That said, the DIRDATA feature that is used for Lustre is also designed
to allow storing the high 32 bits of the inode number in the directory.
This would allow compatible upgrade of a directory to storing both
32-bit and 64-bit inode numbers without the need for wholescale conversion
of directories, or having space for 64-bit inode numbers even if most
of the inodes are only 32-bit values.


Cheers, Andreas
Andreas Dilger March 19, 2017, 6:13 a.m. | #10
On Mar 18, 2017, at 6:39 PM, Theodore Ts'o <tytso@mit.edu> wrote:
> 
> On Sat, Mar 18, 2017 at 08:17:55PM +0300, Alexey Lyashkov wrote:
>>>> 1) readdir. It tries to read all entries in memory before send to
>>>> the user. currently it may eats 20*10^6 * 256 so several gigs, so
>>>> increasing it size may produce a problems for a system.
>>> 
>>> That's not true.  We normally only read in one block a time.  If there
>>> is a hash collision, then we may need to insert into the rbtree in a
>>> subsequent block's worth of dentries to make sure we have all of the
>>> directory entries corresponding to a particular hash value.  I think
>>> you misunderstood the code.
>> 
>> As i see it not about hash collisions, but about merging a several
>> blocks into same hash range on up level hash entry.  so if we have a
>> large hash range originally assigned to the single block, all that
>> range will read at memory at single step.  With «aged» directory
>> when hash blocks used already - it’s easy to hit.
> 
> If you look at ext4_htree_fill_tree(), we are only iterating over the
> leaf blocks.  We are using a 31-bit hash, where the low-order bit is
> one if there has been a collision.  In that case, we need to read the
> next block to make sure all of the directory entries which have the
> same 31-bit hash are in the rbtree.

I guess the main concern here is that insertion into an htree directory
is essentially random based on the hash, so for very large directories
the whole directory (htree and leaf blocks) needs to fit into RAM or the
performance will suffer since every leaf block will need to be read from
disk for every create/lookup/delete.

That said, the maximum size of a 2-level htree directory is about 1GB,
and servers have 128 or 256GB of RAM these days, so I don't think this
will be a huge problem.  We also have a tunable that can limit the
maximum directory size if this becomes a problem.

> And I'll note that this is a problem that can be solved without
> changing the on-disk format, but simply adding code so we can merge
> adjacent directory entry blocks.  The main reason why it hasn't been
> done is (a) no one has sent me patches, and (b) I haven't seen
> workloads where this has been anything other than a theoretical /
> academic concern.
> 
> You seem very passionate about this.  Is this a problem you've
> personally seen?  If so, can you give me more details about your use
> case, and how you've been running into this issue?  Instead of just
> arguing about it from a largely theoretical perspective?

We have seen large directories at the htree limit unable to add new
entries because the htree/leaf blocks become fragmented from repeated
create/delete cycles.  I agree that handling directory shrinking
would probably solve that problem, since the htree and leaf blocks
would be compacted during deletion and then the htree would be able
to split the leaf blocks in the right location for the new entries.

Cheers, Andreas

>> Yes, i expect to have some seek penalty. But may testing say it too huge
>> now.  directory creation rate started with 80k create/s have dropped
>> to the 20k-30k create/s with hash tree extend to the level 3.  Same
>> testing with hard links same create rate dropped slightly.
> 
> So this sounds like it's all about the seek penalty of the _data_
> blocks.  If you use hard links the creation rate only dropped a
> little, am I understanding you correctly?  (Sorry, your English is a
> little fracturered so I'm having trouble parsing the meaning out of
> your sentences.)
> 
> So what do you think the creation rate _should_ be?  And where do you
> think the time is going to, if it's not the fact that we have to place
> the data blocks further and further from the directory?  And more
> importantly, what's your proposal for how to "fix" this?
> 
>>> As for the other optimizations --- things like allowing parallel
>>> directory modifications, or being able to shrink empty directory
>>> blocks or shorten the tree are all improvements we can make without
>>> impacting the on-disk format.  So they aren't an argument for halting
>>> the submission of the new on-disk format, no?
>>> 
>> It’s argument about using this feature. Yes, we can land it, but it decrease an expected speed in some cases.
> 
> But there are cases where today, the workload would simply fail with
> ENOSPC when the directory couldn't grow any farther.  So in those
> cases _maybe_ there is something we could do differently that might
> make things faster, but you have yet to convince me that the
> fundamental fault is one that can only be cured by an on-disk format
> change.  (And if you believe this is to be true, please enlighten us
> on how we can make the on-disk format better!)
> 
> Cheers,
> 
> 					- Ted


Cheers, Andreas
Theodore Ts'o April 30, 2017, 12:59 a.m. | #11
On Thu, Mar 16, 2017 at 05:51:17AM -0400, Artem Blagodarenko wrote:
> From: Artem Blagodarenko <artem.blagodarenko@gmail.com>
> 
> This INCOMPAT_LARGEDIR feature allows larger directories to be created
> in ldiskfs, both with directory sizes over 2GB and and a maximum htree
> depth of 3 instead of the current limit of 2. These features are needed
> in order to exceed the current limit of approximately 10M entries in a
> single directory.
> 
> Signed-off-by: Yang Sheng <yang.sheng@intel.com>
> Signed-off-by: Artem Blagodarenko <artem.blagodarenko@seagate.com>

Thanks, applied.

					- Ted
Eric Biggers May 1, 2017, 6:58 p.m. | #12
On Sat, Apr 29, 2017 at 08:59:53PM -0400, Theodore Ts'o wrote:
> On Thu, Mar 16, 2017 at 05:51:17AM -0400, Artem Blagodarenko wrote:
> > From: Artem Blagodarenko <artem.blagodarenko@gmail.com>
> > 
> > This INCOMPAT_LARGEDIR feature allows larger directories to be created
> > in ldiskfs, both with directory sizes over 2GB and and a maximum htree
> > depth of 3 instead of the current limit of 2. These features are needed
> > in order to exceed the current limit of approximately 10M entries in a
> > single directory.
> > 
> > Signed-off-by: Yang Sheng <yang.sheng@intel.com>
> > Signed-off-by: Artem Blagodarenko <artem.blagodarenko@seagate.com>
> 
> Thanks, applied.
> 
> 					- Ted

FYI, this patch is causing a problem when creating many files in a directory
(without the largedir feature enabled).  I haven't looked into it but maybe it
will ring a bell for someone.

seq -f "abcdefghijklmnopqrstuvwxyz012345%.0f" 100000 | xargs touch

[   42.726480] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum
[   42.729472] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum
[   42.731689] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum
[   42.734303] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum
[   42.736383] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum
[   42.739133] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum
[   42.741307] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum
[   42.743963] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum
[   42.745998] EXT4-fs error (device vdc): dx_probe:840: inode #2: block 508: comm touch: Directory index failed checksum
...

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 01d52b9..0bbbd9b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1799,7 +1799,8 @@  static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 					 EXT4_FEATURE_INCOMPAT_MMP | \
 					 EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
 					 EXT4_FEATURE_INCOMPAT_ENCRYPT | \
-					 EXT4_FEATURE_INCOMPAT_CSUM_SEED)
+					 EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
+					 EXT4_FEATURE_INCOMPAT_LARGEDIR)
 #define EXT4_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
 					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
 					 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -2125,6 +2126,16 @@  struct dir_private_info {
  */
 #define ERR_BAD_DX_DIR	(-(MAX_ERRNO - 1))
 
+/* htree levels for ext4 */
+#define	EXT4_HTREE_LEVEL_COMPAT	2
+#define	EXT4_HTREE_LEVEL	3
+
+static inline int ext4_dir_htree_level(struct super_block *sb)
+{
+	return ext4_has_feature_largedir(sb) ?
+		EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
+}
+
 /*
  * Timeout and state flag for lazy initialization inode thread.
  */
@@ -2758,13 +2769,15 @@  static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
 	es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
 }
 
-static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
+static inline loff_t ext4_isize(struct super_block *sb,
+				struct ext4_inode *raw_inode)
 {
-	if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
+	if (ext4_has_feature_largedir(sb) ||
+	    S_ISREG(le16_to_cpu(raw_inode->i_mode)))
 		return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
 			le32_to_cpu(raw_inode->i_size_lo);
-	else
-		return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
+
+	return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
 }
 
 static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f622d4a..5787f3d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4682,7 +4682,7 @@  struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	if (ext4_has_feature_64bit(sb))
 		ei->i_file_acl |=
 			((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
-	inode->i_size = ext4_isize(raw_inode);
+	inode->i_size = ext4_isize(sb, raw_inode);
 	if ((size = i_size_read(inode)) < 0) {
 		EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
 		ret = -EFSCORRUPTED;
@@ -5008,7 +5008,7 @@  static int ext4_do_update_inode(handle_t *handle,
 		raw_inode->i_file_acl_high =
 			cpu_to_le16(ei->i_file_acl >> 32);
 	raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
-	if (ei->i_disksize != ext4_isize(raw_inode)) {
+	if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
 		ext4_isize_set(raw_inode, ei->i_disksize);
 		need_datasync = 1;
 	}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6ad612c..3298fe3 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -513,7 +513,7 @@  static inline int ext4_handle_dirty_dx_node(handle_t *handle,
 
 static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
 {
-	return le32_to_cpu(entry->block) & 0x00ffffff;
+	return le32_to_cpu(entry->block) & 0x0fffffff;
 }
 
 static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
@@ -739,6 +739,7 @@  struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
 	struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
 	u32 hash;
 
+	memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
 	frame->bh = ext4_read_dirblock(dir, 0, INDEX);
 	if (IS_ERR(frame->bh))
 		return (struct dx_frame *) frame->bh;
@@ -768,9 +769,15 @@  struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
 	}
 
 	indirect = root->info.indirect_levels;
-	if (indirect > 1) {
-		ext4_warning_inode(dir, "Unimplemented hash depth: %#06x",
-				   root->info.indirect_levels);
+	if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
+		ext4_warning(dir->i_sb,
+			     "Directory (ino: %lu) htree depth %#06x exceed"
+			     "supported value", dir->i_ino,
+			     ext4_dir_htree_level(dir->i_sb));
+		if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
+			ext4_warning(dir->i_sb, "Enable large directory "
+						"feature to access it");
+		}
 		goto fail;
 	}
 
@@ -859,12 +866,19 @@  struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
 
 static void dx_release(struct dx_frame *frames)
 {
+	struct dx_root_info *info;
+	int i;
+
 	if (frames[0].bh == NULL)
 		return;
 
-	if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels)
-		brelse(frames[1].bh);
-	brelse(frames[0].bh);
+	info = &((struct dx_root *)frames[0].bh->b_data)->info;
+	for (i = 0; i <= info->indirect_levels; i++) {
+		if (frames[i].bh == NULL)
+			break;
+		brelse(frames[i].bh);
+		frames[i].bh = NULL;
+	}
 }
 
 /*
@@ -1050,7 +1064,7 @@  int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
 {
 	struct dx_hash_info hinfo;
 	struct ext4_dir_entry_2 *de;
-	struct dx_frame frames[2], *frame;
+	struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
 	struct inode *dir;
 	ext4_lblk_t block;
 	int count = 0;
@@ -1517,7 +1531,7 @@  static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
 			struct ext4_dir_entry_2 **res_dir)
 {
 	struct super_block * sb = dir->i_sb;
-	struct dx_frame frames[2], *frame;
+	struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
 	const struct qstr *d_name = fname->usr_fname;
 	struct buffer_head *bh;
 	ext4_lblk_t block;
@@ -1947,7 +1961,7 @@  static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
 	 */
 	dir->i_mtime = dir->i_ctime = current_time(dir);
 	ext4_update_dx_flag(dir);
-	dir->i_version++;
+	inode_inc_iversion(dir);
 	ext4_mark_inode_dirty(handle, dir);
 	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
 	err = ext4_handle_dirty_dirent_node(handle, dir, bh);
@@ -1966,7 +1980,7 @@  static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
 {
 	struct buffer_head *bh2;
 	struct dx_root	*root;
-	struct dx_frame	frames[2], *frame;
+	struct dx_frame	frames[EXT4_HTREE_LEVEL], *frame;
 	struct dx_entry *entries;
 	struct ext4_dir_entry_2	*de, *de2;
 	struct ext4_dir_entry_tail *t;
@@ -2185,13 +2199,16 @@  static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
 			     struct inode *dir, struct inode *inode)
 {
-	struct dx_frame frames[2], *frame;
+	struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
 	struct dx_entry *entries, *at;
 	struct buffer_head *bh;
 	struct super_block *sb = dir->i_sb;
 	struct ext4_dir_entry_2 *de;
+	int restart;
 	int err;
 
+again:
+	restart = 0;
 	frame = dx_probe(fname, dir, NULL, frames);
 	if (IS_ERR(frame))
 		return PTR_ERR(frame);
@@ -2213,24 +2230,44 @@  static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
 	if (err != -ENOSPC)
 		goto cleanup;
 
+	err = 0;
 	/* Block full, should compress but for now just split */
 	dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
 		       dx_get_count(entries), dx_get_limit(entries)));
 	/* Need to split index? */
 	if (dx_get_count(entries) == dx_get_limit(entries)) {
 		ext4_lblk_t newblock;
-		unsigned icount = dx_get_count(entries);
-		int levels = frame - frames;
+		int levels = frame - frames + 1;
+		unsigned int icount;
+		int add_level = 1;
 		struct dx_entry *entries2;
 		struct dx_node *node2;
 		struct buffer_head *bh2;
 
-		if (levels && (dx_get_count(frames->entries) ==
-			       dx_get_limit(frames->entries))) {
-			ext4_warning_inode(dir, "Directory index full!");
+		while (frame > frames) {
+			if (dx_get_count((frame - 1)->entries) <
+			    dx_get_limit((frame - 1)->entries)) {
+				add_level = 0;
+				break;
+			}
+			frame--; /* split higher index block */
+			at = frame->at;
+			entries = frame->entries;
+			restart = 1;
+		}
+		if (add_level && levels == ext4_dir_htree_level(sb)) {
+			ext4_warning(sb, "Directory (ino: %lu) index full, "
+					 "reach max htree level :%d",
+					 dir->i_ino, levels);
+			if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
+				ext4_warning(sb, "Large directory feature is "
+						 "not enabled on this "
+						 "filesystem");
+			}
 			err = -ENOSPC;
 			goto cleanup;
 		}
+		icount = dx_get_count(entries);
 		bh2 = ext4_append(handle, dir, &newblock);
 		if (IS_ERR(bh2)) {
 			err = PTR_ERR(bh2);
@@ -2245,7 +2282,7 @@  static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
 		err = ext4_journal_get_write_access(handle, frame->bh);
 		if (err)
 			goto journal_error;
-		if (levels) {
+		if (!add_level) {
 			unsigned icount1 = icount/2, icount2 = icount - icount1;
 			unsigned hash2 = dx_get_hash(entries + icount1);
 			dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
@@ -2253,7 +2290,7 @@  static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
 
 			BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
 			err = ext4_journal_get_write_access(handle,
-							     frames[0].bh);
+							     (frame - 1)->bh);
 			if (err)
 				goto journal_error;
 
@@ -2269,17 +2306,23 @@  static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
 				frame->entries = entries = entries2;
 				swap(frame->bh, bh2);
 			}
-			dx_insert_block(frames + 0, hash2, newblock);
-			dxtrace(dx_show_index("node", frames[1].entries));
+			dx_insert_block((frame - 1), hash2, newblock);
+			dxtrace(dx_show_index("node", frame->entries));
 			dxtrace(dx_show_index("node",
 			       ((struct dx_node *) bh2->b_data)->entries));
 			err = ext4_handle_dirty_dx_node(handle, dir, bh2);
 			if (err)
 				goto journal_error;
 			brelse (bh2);
+			ext4_handle_dirty_metadata(handle, dir,
+						   (frame - 1)->bh);
+			if (restart) {
+				ext4_handle_dirty_metadata(handle, dir,
+							   frame->bh);
+				goto cleanup;
+			}
 		} else {
-			dxtrace(printk(KERN_DEBUG
-				       "Creating second level index...\n"));
+			struct dx_root *dxroot;
 			memcpy((char *) entries2, (char *) entries,
 			       icount * sizeof(struct dx_entry));
 			dx_set_limit(entries2, dx_node_limit(dir));
@@ -2287,19 +2330,17 @@  static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
 			/* Set up root */
 			dx_set_count(entries, 1);
 			dx_set_block(entries + 0, newblock);
-			((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
-
-			/* Add new access path frame */
-			frame = frames + 1;
-			frame->at = at = at - entries + entries2;
-			frame->entries = entries = entries2;
-			frame->bh = bh2;
-			err = ext4_journal_get_write_access(handle,
-							     frame->bh);
-			if (err)
-				goto journal_error;
+			dxroot = (struct dx_root *)frames[0].bh->b_data;
+			dxroot->info.indirect_levels += 1;
+			dxtrace(printk(KERN_DEBUG
+				       "Creating %d level index...\n",
+				       info->indirect_levels));
+			ext4_handle_dirty_metadata(handle, dir, frame->bh);
+			ext4_handle_dirty_metadata(handle, dir, bh2);
+			brelse(bh2);
+			restart = 1;
+			goto cleanup;
 		}
-		err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
 		if (err) {
 			ext4_std_error(inode->i_sb, err);
 			goto cleanup;
@@ -2318,6 +2359,11 @@  static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
 cleanup:
 	brelse(bh);
 	dx_release(frames);
+	/* @restart is true means htree-path has been changed, we need to
+	 * repeat dx_probe() to find out valid htree-path
+	 */
+	if (restart && err == 0)
+		goto again;
 	return err;
 }
 
@@ -2354,7 +2400,7 @@  int ext4_generic_delete_entry(handle_t *handle,
 					blocksize);
 			else
 				de->inode = 0;
-			dir->i_version++;
+			inode_inc_iversion(dir);
 			return 0;
 		}
 		i += ext4_rec_len_from_disk(de->rec_len, blocksize);