diff mbox series

[4/4] ext4: add proc files to monitor new structures

Message ID 20210129222931.623008-5-harshadshirwadkar@gmail.com
State New
Headers show
Series Improve group scanning in CR 0 and CR 1 passes | expand

Commit Message

harshad shirwadkar Jan. 29, 2021, 10:29 p.m. UTC
This patch adds a new file "mb_structs" which allows us to see the
largest free order lists and the serialized average fragment tree.

Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
---
 fs/ext4/ext4.h    |  1 +
 fs/ext4/mballoc.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/sysfs.c   |  2 ++
 3 files changed, 82 insertions(+)

Comments

Andreas Dilger Jan. 30, 2021, 10:58 a.m. UTC | #1
On Jan 29, 2021, at 3:29 PM, Harshad Shirwadkar <harshadshirwadkar@gmail.com> wrote:
> 
> This patch adds a new file "mb_structs" which allows us to see the
> largest free order lists and the serialized average fragment tree.

It might make sense to split these two structs into separate files.
On large filesystems, there will be millions of groups, so having
to dump both structs to find out information about one or the other
would be a lot of overhead.

Also, while the "mb_groups" file can be large (one line per group)
for millions of groups, the files added here may have millions of
groups on the same line.  Text processing tools are used to dealing
with many lines in a file, but are relatively poor at dealing with
millions of characters on a single line...

It would be good to have a "summary" file that dumps general info
about these structs (e.g. the number of groups at each list order,
depth of the rbtree).  That could be maintained easily for the c0
lists at least, since the list is locked whenever a group is added
and removed.

In Artem's patch to improve mballoc for large filesystems (which
didn't land, but was useful anyway), there was an "mb_alloc_stats"
file added:

https://github.com/lustre/lustre-release/blob/master/ldiskfs/kernel_patches/patches/rhel8/ext4-simple-blockalloc.patch#L102

/proc/fs/ldiskfs/dm-2/mb_alloc_stats:mballoc:
/proc/fs/ldiskfs/dm-2/mb_alloc_stats:   blocks: 0
/proc/fs/ldiskfs/dm-2/mb_alloc_stats:   reqs: 0
/proc/fs/ldiskfs/dm-2/mb_alloc_stats:   success: 0
/proc/fs/ldiskfs/dm-2/mb_alloc_stats:   extents_scanned: 0
/proc/fs/ldiskfs/dm-2/mb_alloc_stats:           goal_hits: 0
/proc/fs/ldiskfs/dm-2/mb_alloc_stats:           2^n_hits: 0
/proc/fs/ldiskfs/dm-2/mb_alloc_stats:           breaks: 0
/proc/fs/ldiskfs/dm-2/mb_alloc_stats:           lost: 0
/proc/fs/ldiskfs/dm-2/mb_alloc_stats:   useless_c1_loops: 0
/proc/fs/ldiskfs/dm-2/mb_alloc_stats:   useless_c2_loops: 0
/proc/fs/ldiskfs/dm-2/mb_alloc_stats:   useless_c3_loops: 163
/proc/fs/ldiskfs/dm-2/mb_alloc_stats:   skipped_c1_loops: 0
/proc/fs/ldiskfs/dm-2/mb_alloc_stats:   skipped_c2_loops: 1230
/proc/fs/ldiskfs/dm-2/mb_alloc_stats:   skipped_c3_loops: 0
/proc/fs/ldiskfs/dm-2/mb_alloc_stats:   buddies_generated: 6305
/proc/fs/ldiskfs/dm-2/mb_alloc_stats:   buddies_time_used: 20165523
/proc/fs/ldiskfs/dm-2/mb_alloc_stats:   preallocated: 9943702
/proc/fs/ldiskfs/dm-2/mb_alloc_stats:   discarded: 10506

that would still be useful to maintain, since only a few of the lines
are specific to his patch ({useless,skipped}_c[123]_loops)

> Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
> ---
> fs/ext4/ext4.h    |  1 +
> fs/ext4/mballoc.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++
> fs/ext4/sysfs.c   |  2 ++
> 3 files changed, 82 insertions(+)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index da12a083bf52..835e304e3113 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -2809,6 +2809,7 @@ int __init ext4_fc_init_dentry_cache(void);
> 
> /* mballoc.c */
> extern const struct seq_operations ext4_mb_seq_groups_ops;
> +extern const struct seq_operations ext4_mb_seq_structs_ops;
> extern long ext4_mb_stats;
> extern long ext4_mb_max_to_scan;
> extern int ext4_mb_init(struct super_block *);
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 413259477b03..ec4656903bd4 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -2741,6 +2741,85 @@ const struct seq_operations ext4_mb_seq_groups_ops = {
> 	.show   = ext4_mb_seq_groups_show,
> };
> 
> +static void *ext4_mb_seq_structs_start(struct seq_file *seq, loff_t *pos)
> +{
> +	struct super_block *sb = PDE_DATA(file_inode(seq->file));
> +	unsigned long position;
> +
> +	read_lock(&EXT4_SB(sb)->s_mb_rb_lock);
> +
> +	if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + ext4_get_groups_count(sb))
> +		return NULL;
> +	position = *pos + 1;
> +	return (void *) ((unsigned long) position);
> +}
> +
> +static void *ext4_mb_seq_structs_next(struct seq_file *seq, void *v, loff_t *pos)
> +{
> +	struct super_block *sb = PDE_DATA(file_inode(seq->file));
> +	unsigned long position;
> +
> +	++*pos;
> +	if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + ext4_get_groups_count(sb))
> +		return NULL;
> +	position = *pos + 1;
> +	return (void *) ((unsigned long) position);
> +}
> +
> +static int ext4_mb_seq_structs_show(struct seq_file *seq, void *v)
> +{
> +	struct super_block *sb = PDE_DATA(file_inode(seq->file));
> +	struct ext4_sb_info *sbi = EXT4_SB(sb);
> +	unsigned long position = ((unsigned long) v);
> +	struct ext4_group_info *grpinfo;
> +	struct rb_node *n;
> +	int i;
> +
> +	position--;
> +
> +	if (position >= MB_NUM_ORDERS(sb)) {
> +		position -= MB_NUM_ORDERS(sb);
> +		if (position == 0)
> +			seq_puts(seq, "Group, Avg Fragment Size\n");
> +		n = rb_first(&sbi->s_mb_avg_fragment_size_root);
> +		for (i = 0; n && i < position; i++)
> +			n = rb_next(n);
> +		if (!n)
> +			return 0;
> +		grpinfo = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb);
> +		seq_printf(seq, "%d, %d\n",
> +			   grpinfo->bb_group,
> +			   grpinfo->bb_fragments ? grpinfo->bb_free / grpinfo->bb_fragments : 0);
> +		return 0;
> +	}
> +
> +	if (position == 0)
> +		seq_puts(seq, "Largest Free Order Lists:\n");
> +
> +	seq_printf(seq, "[%ld]: ", position);
> +	list_for_each_entry(grpinfo, &sbi->s_mb_largest_free_orders[position],
> +			    bb_largest_free_order_node)	{
> +		seq_printf(seq, "%d ", grpinfo->bb_group);
> +	}
> +	seq_puts(seq, "\n");
> +
> +	return 0;
> +}
> +
> +static void ext4_mb_seq_structs_stop(struct seq_file *seq, void *v)
> +{
> +	struct super_block *sb = PDE_DATA(file_inode(seq->file));
> +
> +	read_unlock(&EXT4_SB(sb)->s_mb_rb_lock);
> +}
> +
> +const struct seq_operations ext4_mb_seq_structs_ops = {
> +	.start  = ext4_mb_seq_structs_start,
> +	.next   = ext4_mb_seq_structs_next,
> +	.stop   = ext4_mb_seq_structs_stop,
> +	.show   = ext4_mb_seq_structs_show,
> +};
> +
> static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
> {
> 	int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
> diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
> index 4e27fe6ed3ae..828d58b98310 100644
> --- a/fs/ext4/sysfs.c
> +++ b/fs/ext4/sysfs.c
> @@ -527,6 +527,8 @@ int ext4_register_sysfs(struct super_block *sb)
> 					ext4_fc_info_show, sb);
> 		proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc,
> 				&ext4_mb_seq_groups_ops, sb);
> +		proc_create_seq_data("mb_structs", 0444, sbi->s_proc,
> +				&ext4_mb_seq_structs_ops, sb);
> 	}
> 	return 0;
> }
> --
> 2.30.0.365.g02bc693789-goog
> 


Cheers, Andreas
harshad shirwadkar Feb. 1, 2021, 11:37 p.m. UTC | #2
I see. That makes sense. Also as you mentioned it would be good to
have a summary file that shows a summary of the structures. In fact,
looking at the contents of mb_groups and if we add a summary file, I
think we probably should just get rid of these verbose files.
mb_groups has information about largest free order and fragment size,
using which, we can reconstruct these data structures and we can then
verify the correctness using the summary file.

On Sat, Jan 30, 2021 at 2:58 AM Andreas Dilger <adilger@dilger.ca> wrote:
>
> On Jan 29, 2021, at 3:29 PM, Harshad Shirwadkar <harshadshirwadkar@gmail.com> wrote:
> >
> > This patch adds a new file "mb_structs" which allows us to see the
> > largest free order lists and the serialized average fragment tree.
>
> It might make sense to split these two structs into separate files.
> On large filesystems, there will be millions of groups, so having
> to dump both structs to find out information about one or the other
> would be a lot of overhead.
>
> Also, while the "mb_groups" file can be large (one line per group)
> for millions of groups, the files added here may have millions of
> groups on the same line.  Text processing tools are used to dealing
> with many lines in a file, but are relatively poor at dealing with
> millions of characters on a single line...
>
> It would be good to have a "summary" file that dumps general info
> about these structs (e.g. the number of groups at each list order,
> depth of the rbtree).  That could be maintained easily for the c0
> lists at least, since the list is locked whenever a group is added
> and removed.
>
> In Artem's patch to improve mballoc for large filesystems (which
> didn't land, but was useful anyway), there was an "mb_alloc_stats"
> file added:
>
> https://github.com/lustre/lustre-release/blob/master/ldiskfs/kernel_patches/patches/rhel8/ext4-simple-blockalloc.patch#L102
>
> /proc/fs/ldiskfs/dm-2/mb_alloc_stats:mballoc:
> /proc/fs/ldiskfs/dm-2/mb_alloc_stats:   blocks: 0
> /proc/fs/ldiskfs/dm-2/mb_alloc_stats:   reqs: 0
> /proc/fs/ldiskfs/dm-2/mb_alloc_stats:   success: 0
> /proc/fs/ldiskfs/dm-2/mb_alloc_stats:   extents_scanned: 0
> /proc/fs/ldiskfs/dm-2/mb_alloc_stats:           goal_hits: 0
> /proc/fs/ldiskfs/dm-2/mb_alloc_stats:           2^n_hits: 0
> /proc/fs/ldiskfs/dm-2/mb_alloc_stats:           breaks: 0
> /proc/fs/ldiskfs/dm-2/mb_alloc_stats:           lost: 0
> /proc/fs/ldiskfs/dm-2/mb_alloc_stats:   useless_c1_loops: 0
> /proc/fs/ldiskfs/dm-2/mb_alloc_stats:   useless_c2_loops: 0
> /proc/fs/ldiskfs/dm-2/mb_alloc_stats:   useless_c3_loops: 163
> /proc/fs/ldiskfs/dm-2/mb_alloc_stats:   skipped_c1_loops: 0
> /proc/fs/ldiskfs/dm-2/mb_alloc_stats:   skipped_c2_loops: 1230
> /proc/fs/ldiskfs/dm-2/mb_alloc_stats:   skipped_c3_loops: 0
> /proc/fs/ldiskfs/dm-2/mb_alloc_stats:   buddies_generated: 6305
> /proc/fs/ldiskfs/dm-2/mb_alloc_stats:   buddies_time_used: 20165523
> /proc/fs/ldiskfs/dm-2/mb_alloc_stats:   preallocated: 9943702
> /proc/fs/ldiskfs/dm-2/mb_alloc_stats:   discarded: 10506
>
> that would still be useful to maintain, since only a few of the lines
> are specific to his patch ({useless,skipped}_c[123]_loops)
I agree, this is very useful to have! it gives us a way to benchmark
efficiency of mballoc. I'll add mb_alloc_stats here.

Thanks,
Harshad
>
> > Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
> > ---
> > fs/ext4/ext4.h    |  1 +
> > fs/ext4/mballoc.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++
> > fs/ext4/sysfs.c   |  2 ++
> > 3 files changed, 82 insertions(+)
> >
> > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> > index da12a083bf52..835e304e3113 100644
> > --- a/fs/ext4/ext4.h
> > +++ b/fs/ext4/ext4.h
> > @@ -2809,6 +2809,7 @@ int __init ext4_fc_init_dentry_cache(void);
> >
> > /* mballoc.c */
> > extern const struct seq_operations ext4_mb_seq_groups_ops;
> > +extern const struct seq_operations ext4_mb_seq_structs_ops;
> > extern long ext4_mb_stats;
> > extern long ext4_mb_max_to_scan;
> > extern int ext4_mb_init(struct super_block *);
> > diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> > index 413259477b03..ec4656903bd4 100644
> > --- a/fs/ext4/mballoc.c
> > +++ b/fs/ext4/mballoc.c
> > @@ -2741,6 +2741,85 @@ const struct seq_operations ext4_mb_seq_groups_ops = {
> >       .show   = ext4_mb_seq_groups_show,
> > };
> >
> > +static void *ext4_mb_seq_structs_start(struct seq_file *seq, loff_t *pos)
> > +{
> > +     struct super_block *sb = PDE_DATA(file_inode(seq->file));
> > +     unsigned long position;
> > +
> > +     read_lock(&EXT4_SB(sb)->s_mb_rb_lock);
> > +
> > +     if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + ext4_get_groups_count(sb))
> > +             return NULL;
> > +     position = *pos + 1;
> > +     return (void *) ((unsigned long) position);
> > +}
> > +
> > +static void *ext4_mb_seq_structs_next(struct seq_file *seq, void *v, loff_t *pos)
> > +{
> > +     struct super_block *sb = PDE_DATA(file_inode(seq->file));
> > +     unsigned long position;
> > +
> > +     ++*pos;
> > +     if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + ext4_get_groups_count(sb))
> > +             return NULL;
> > +     position = *pos + 1;
> > +     return (void *) ((unsigned long) position);
> > +}
> > +
> > +static int ext4_mb_seq_structs_show(struct seq_file *seq, void *v)
> > +{
> > +     struct super_block *sb = PDE_DATA(file_inode(seq->file));
> > +     struct ext4_sb_info *sbi = EXT4_SB(sb);
> > +     unsigned long position = ((unsigned long) v);
> > +     struct ext4_group_info *grpinfo;
> > +     struct rb_node *n;
> > +     int i;
> > +
> > +     position--;
> > +
> > +     if (position >= MB_NUM_ORDERS(sb)) {
> > +             position -= MB_NUM_ORDERS(sb);
> > +             if (position == 0)
> > +                     seq_puts(seq, "Group, Avg Fragment Size\n");
> > +             n = rb_first(&sbi->s_mb_avg_fragment_size_root);
> > +             for (i = 0; n && i < position; i++)
> > +                     n = rb_next(n);
> > +             if (!n)
> > +                     return 0;
> > +             grpinfo = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb);
> > +             seq_printf(seq, "%d, %d\n",
> > +                        grpinfo->bb_group,
> > +                        grpinfo->bb_fragments ? grpinfo->bb_free / grpinfo->bb_fragments : 0);
> > +             return 0;
> > +     }
> > +
> > +     if (position == 0)
> > +             seq_puts(seq, "Largest Free Order Lists:\n");
> > +
> > +     seq_printf(seq, "[%ld]: ", position);
> > +     list_for_each_entry(grpinfo, &sbi->s_mb_largest_free_orders[position],
> > +                         bb_largest_free_order_node) {
> > +             seq_printf(seq, "%d ", grpinfo->bb_group);
> > +     }
> > +     seq_puts(seq, "\n");
> > +
> > +     return 0;
> > +}
> > +
> > +static void ext4_mb_seq_structs_stop(struct seq_file *seq, void *v)
> > +{
> > +     struct super_block *sb = PDE_DATA(file_inode(seq->file));
> > +
> > +     read_unlock(&EXT4_SB(sb)->s_mb_rb_lock);
> > +}
> > +
> > +const struct seq_operations ext4_mb_seq_structs_ops = {
> > +     .start  = ext4_mb_seq_structs_start,
> > +     .next   = ext4_mb_seq_structs_next,
> > +     .stop   = ext4_mb_seq_structs_stop,
> > +     .show   = ext4_mb_seq_structs_show,
> > +};
> > +
> > static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
> > {
> >       int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
> > diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
> > index 4e27fe6ed3ae..828d58b98310 100644
> > --- a/fs/ext4/sysfs.c
> > +++ b/fs/ext4/sysfs.c
> > @@ -527,6 +527,8 @@ int ext4_register_sysfs(struct super_block *sb)
> >                                       ext4_fc_info_show, sb);
> >               proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc,
> >                               &ext4_mb_seq_groups_ops, sb);
> > +             proc_create_seq_data("mb_structs", 0444, sbi->s_proc,
> > +                             &ext4_mb_seq_structs_ops, sb);
> >       }
> >       return 0;
> > }
> > --
> > 2.30.0.365.g02bc693789-goog
> >
>
>
> Cheers, Andreas
>
>
>
>
>
diff mbox series

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index da12a083bf52..835e304e3113 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2809,6 +2809,7 @@  int __init ext4_fc_init_dentry_cache(void);
 
 /* mballoc.c */
 extern const struct seq_operations ext4_mb_seq_groups_ops;
+extern const struct seq_operations ext4_mb_seq_structs_ops;
 extern long ext4_mb_stats;
 extern long ext4_mb_max_to_scan;
 extern int ext4_mb_init(struct super_block *);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 413259477b03..ec4656903bd4 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2741,6 +2741,85 @@  const struct seq_operations ext4_mb_seq_groups_ops = {
 	.show   = ext4_mb_seq_groups_show,
 };
 
+static void *ext4_mb_seq_structs_start(struct seq_file *seq, loff_t *pos)
+{
+	struct super_block *sb = PDE_DATA(file_inode(seq->file));
+	unsigned long position;
+
+	read_lock(&EXT4_SB(sb)->s_mb_rb_lock);
+
+	if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + ext4_get_groups_count(sb))
+		return NULL;
+	position = *pos + 1;
+	return (void *) ((unsigned long) position);
+}
+
+static void *ext4_mb_seq_structs_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct super_block *sb = PDE_DATA(file_inode(seq->file));
+	unsigned long position;
+
+	++*pos;
+	if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + ext4_get_groups_count(sb))
+		return NULL;
+	position = *pos + 1;
+	return (void *) ((unsigned long) position);
+}
+
+static int ext4_mb_seq_structs_show(struct seq_file *seq, void *v)
+{
+	struct super_block *sb = PDE_DATA(file_inode(seq->file));
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	unsigned long position = ((unsigned long) v);
+	struct ext4_group_info *grpinfo;
+	struct rb_node *n;
+	int i;
+
+	position--;
+
+	if (position >= MB_NUM_ORDERS(sb)) {
+		position -= MB_NUM_ORDERS(sb);
+		if (position == 0)
+			seq_puts(seq, "Group, Avg Fragment Size\n");
+		n = rb_first(&sbi->s_mb_avg_fragment_size_root);
+		for (i = 0; n && i < position; i++)
+			n = rb_next(n);
+		if (!n)
+			return 0;
+		grpinfo = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb);
+		seq_printf(seq, "%d, %d\n",
+			   grpinfo->bb_group,
+			   grpinfo->bb_fragments ? grpinfo->bb_free / grpinfo->bb_fragments : 0);
+		return 0;
+	}
+
+	if (position == 0)
+		seq_puts(seq, "Largest Free Order Lists:\n");
+
+	seq_printf(seq, "[%ld]: ", position);
+	list_for_each_entry(grpinfo, &sbi->s_mb_largest_free_orders[position],
+			    bb_largest_free_order_node)	{
+		seq_printf(seq, "%d ", grpinfo->bb_group);
+	}
+	seq_puts(seq, "\n");
+
+	return 0;
+}
+
+static void ext4_mb_seq_structs_stop(struct seq_file *seq, void *v)
+{
+	struct super_block *sb = PDE_DATA(file_inode(seq->file));
+
+	read_unlock(&EXT4_SB(sb)->s_mb_rb_lock);
+}
+
+const struct seq_operations ext4_mb_seq_structs_ops = {
+	.start  = ext4_mb_seq_structs_start,
+	.next   = ext4_mb_seq_structs_next,
+	.stop   = ext4_mb_seq_structs_stop,
+	.show   = ext4_mb_seq_structs_show,
+};
+
 static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
 {
 	int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 4e27fe6ed3ae..828d58b98310 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -527,6 +527,8 @@  int ext4_register_sysfs(struct super_block *sb)
 					ext4_fc_info_show, sb);
 		proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc,
 				&ext4_mb_seq_groups_ops, sb);
+		proc_create_seq_data("mb_structs", 0444, sbi->s_proc,
+				&ext4_mb_seq_structs_ops, sb);
 	}
 	return 0;
 }