diff mbox series

[RFC,v3,8/8] ext4: Remove the logic to trim inode PAs

Message ID a26fdd12f4f60cf506a42b6a95e8014e5f380b05.1664269665.git.ojaswin@linux.ibm.com
State Superseded
Headers show
Series ext4: Convert inode preallocation list to an rbtree | expand

Commit Message

Ojaswin Mujoo Sept. 27, 2022, 9:16 a.m. UTC
Earlier, inode PAs were stored in a linked list. This caused a need to
periodically trim the list down inorder to avoid growing it to a very
large size, as this would severly affect performance during list
iteration.

Recent patches changed this list to an rbtree, and since the tree scales
up much better, we no longer need to have the trim functionality, hence
remove it.

Signed-off-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
---
 Documentation/admin-guide/ext4.rst |  3 ---
 fs/ext4/ext4.h                     |  1 -
 fs/ext4/mballoc.c                  | 20 --------------------
 fs/ext4/mballoc.h                  |  5 -----
 fs/ext4/sysfs.c                    |  2 --
 5 files changed, 31 deletions(-)

Comments

Jan Kara Sept. 29, 2022, 12:53 p.m. UTC | #1
On Tue 27-09-22 14:46:48, Ojaswin Mujoo wrote:
> Earlier, inode PAs were stored in a linked list. This caused a need to
> periodically trim the list down inorder to avoid growing it to a very
> large size, as this would severly affect performance during list
> iteration.
> 
> Recent patches changed this list to an rbtree, and since the tree scales
> up much better, we no longer need to have the trim functionality, hence
> remove it.
> 
> Signed-off-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
> Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>

I'm kind of wondering: Now there won't be performance issues with much
more inode PAs but probably we don't want to let them grow completely out
of control? E.g. I can imagine that if we'd have 1 billion of inode PAs
attached to an inode, things would get wonky both in terms of memory
consumption and also in terms of CPU time spent for the cases where we
still do iterate all of the PAs... Is there anything which keeps inode PAs
reasonably bounded?

								Honza

> ---
>  Documentation/admin-guide/ext4.rst |  3 ---
>  fs/ext4/ext4.h                     |  1 -
>  fs/ext4/mballoc.c                  | 20 --------------------
>  fs/ext4/mballoc.h                  |  5 -----
>  fs/ext4/sysfs.c                    |  2 --
>  5 files changed, 31 deletions(-)
> 
> diff --git a/Documentation/admin-guide/ext4.rst b/Documentation/admin-guide/ext4.rst
> index 4c559e08d11e..5740d85439ff 100644
> --- a/Documentation/admin-guide/ext4.rst
> +++ b/Documentation/admin-guide/ext4.rst
> @@ -489,9 +489,6 @@ Files in /sys/fs/ext4/<devname>:
>          multiple of this tuning parameter if the stripe size is not set in the
>          ext4 superblock
>  
> -  mb_max_inode_prealloc
> -        The maximum length of per-inode ext4_prealloc_space list.
> -
>    mb_max_to_scan
>          The maximum number of extents the multiblock allocator will search to
>          find the best extent.
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index d54b972f1f0f..bca4b41cc192 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1612,7 +1612,6 @@ struct ext4_sb_info {
>  	unsigned int s_mb_stats;
>  	unsigned int s_mb_order2_reqs;
>  	unsigned int s_mb_group_prealloc;
> -	unsigned int s_mb_max_inode_prealloc;
>  	unsigned int s_max_dir_size_kb;
>  	/* where last allocation was done - for stream allocation */
>  	unsigned long s_mb_last_group;
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index cd19b9e84767..57e1ec88477a 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -3420,7 +3420,6 @@ int ext4_mb_init(struct super_block *sb)
>  	sbi->s_mb_stats = MB_DEFAULT_STATS;
>  	sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
>  	sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
> -	sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC;
>  	/*
>  	 * The default group preallocation is 512, which for 4k block
>  	 * sizes translates to 2 megabytes.  However for bigalloc file
> @@ -5546,29 +5545,11 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
>  	return ;
>  }
>  
> -/*
> - * if per-inode prealloc list is too long, trim some PA
> - */
> -static void ext4_mb_trim_inode_pa(struct inode *inode)
> -{
> -	struct ext4_inode_info *ei = EXT4_I(inode);
> -	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> -	int count, delta;
> -
> -	count = atomic_read(&ei->i_prealloc_active);
> -	delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1;
> -	if (count > sbi->s_mb_max_inode_prealloc + delta) {
> -		count -= sbi->s_mb_max_inode_prealloc;
> -		ext4_discard_preallocations(inode, count);
> -	}
> -}
> -
>  /*
>   * release all resource we used in allocation
>   */
>  static int ext4_mb_release_context(struct ext4_allocation_context *ac)
>  {
> -	struct inode *inode = ac->ac_inode;
>  	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
>  	struct ext4_prealloc_space *pa = ac->ac_pa;
>  	if (pa) {
> @@ -5604,7 +5585,6 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
>  	if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
>  		mutex_unlock(&ac->ac_lg->lg_mutex);
>  	ext4_mb_collect_stats(ac);
> -	ext4_mb_trim_inode_pa(inode);
>  	return 0;
>  }
>  
> diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
> index f8e8ee493867..6d85ee8674a6 100644
> --- a/fs/ext4/mballoc.h
> +++ b/fs/ext4/mballoc.h
> @@ -73,11 +73,6 @@
>   */
>  #define MB_DEFAULT_GROUP_PREALLOC	512
>  
> -/*
> - * maximum length of inode prealloc list
> - */
> -#define MB_DEFAULT_MAX_INODE_PREALLOC	512
> -
>  /*
>   * Number of groups to search linearly before performing group scanning
>   * optimization.
> diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
> index d233c24ea342..f0d42cf44c71 100644
> --- a/fs/ext4/sysfs.c
> +++ b/fs/ext4/sysfs.c
> @@ -214,7 +214,6 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
>  EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
>  EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
>  EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
> -EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc);
>  EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups);
>  EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
>  EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
> @@ -264,7 +263,6 @@ static struct attribute *ext4_attrs[] = {
>  	ATTR_LIST(mb_order2_req),
>  	ATTR_LIST(mb_stream_req),
>  	ATTR_LIST(mb_group_prealloc),
> -	ATTR_LIST(mb_max_inode_prealloc),
>  	ATTR_LIST(mb_max_linear_groups),
>  	ATTR_LIST(max_writeback_mb_bump),
>  	ATTR_LIST(extent_max_zeroout_kb),
> -- 
> 2.31.1
>
Ojaswin Mujoo Oct. 6, 2022, 6:55 a.m. UTC | #2
On Thu, Sep 29, 2022 at 02:53:11PM +0200, Jan Kara wrote:
> On Tue 27-09-22 14:46:48, Ojaswin Mujoo wrote:
> > Earlier, inode PAs were stored in a linked list. This caused a need to
> > periodically trim the list down inorder to avoid growing it to a very
> > large size, as this would severly affect performance during list
> > iteration.
> > 
> > Recent patches changed this list to an rbtree, and since the tree scales
> > up much better, we no longer need to have the trim functionality, hence
> > remove it.
> > 
> > Signed-off-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
> > Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
> 
> I'm kind of wondering: Now there won't be performance issues with much
> more inode PAs but probably we don't want to let them grow completely out
> of control? E.g. I can imagine that if we'd have 1 billion of inode PAs
> attached to an inode, things would get wonky both in terms of memory
> consumption and also in terms of CPU time spent for the cases where we
> still do iterate all of the PAs... Is there anything which keeps inode PAs
> reasonably bounded?
> 
> 								Honza
> 
Hi Jan,

Sorry for the delay in response, I was on leave for the last few days.

So as per my understanding, after this patch, the only path where we
would need to traverse all the PAs is the ext4_discard_preallocations()
call where we discard all the PAs of an inode one by one (eg when
closing the file etc).  Such a discard is a colder path as we don't
usually expect to do it as often as say allocating blocks to an inode.

Originally, the limit was added in this patch [1] because of the time
lost in O(N) traversal in the allocation path (ext4_mb_use_preallocated
and ext4_mb_normalize_request). Since the rbtree addressed this
scalability issue we had decided to remove the trim logic in this
patchset.

[1]
https://lore.kernel.org/all/d7a98178-056b-6db5-6bce-4ead23f4a257@gmail.com/

That being said, I do agree that there should be some way to limit the
PAs from taking up an unreasonable amount of buddy space, memory and CPU
cycles in use cases like database files and disk files of long running
VMs. Previously the limit was 512 PAs per inode and trim was happening
in an LRU fashion, which is not very straightforward to implement in
trees. 

Another approach is rather than having a hard limit, we can throttle the
PAs based on some parameter like total active PAs in FS or FSUtil% of
the PAs but we might need to take care of fairness so one inode is not
holding all the PAs while others get throttled.

Anyways, I think the trimming part would need some brainstorming to get
right so just wondering if we could keep that as part of a separate
patchset and remove the trimming logic for now since rbtree has
addressed the scalability concerns in allocation path.

Do let me know your thoughts on this.

Regards,
Ojaswin
Jan Kara Oct. 6, 2022, 8:59 a.m. UTC | #3
On Thu 06-10-22 12:25:00, Ojaswin Mujoo wrote:
> On Thu, Sep 29, 2022 at 02:53:11PM +0200, Jan Kara wrote:
> > On Tue 27-09-22 14:46:48, Ojaswin Mujoo wrote:
> > > Earlier, inode PAs were stored in a linked list. This caused a need to
> > > periodically trim the list down inorder to avoid growing it to a very
> > > large size, as this would severly affect performance during list
> > > iteration.
> > > 
> > > Recent patches changed this list to an rbtree, and since the tree scales
> > > up much better, we no longer need to have the trim functionality, hence
> > > remove it.
> > > 
> > > Signed-off-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
> > > Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
> > 
> > I'm kind of wondering: Now there won't be performance issues with much
> > more inode PAs but probably we don't want to let them grow completely out
> > of control? E.g. I can imagine that if we'd have 1 billion of inode PAs
> > attached to an inode, things would get wonky both in terms of memory
> > consumption and also in terms of CPU time spent for the cases where we
> > still do iterate all of the PAs... Is there anything which keeps inode PAs
> > reasonably bounded?
> > 
> > 								Honza
> > 
> Hi Jan,
> 
> Sorry for the delay in response, I was on leave for the last few days.
> 
> So as per my understanding, after this patch, the only path where we
> would need to traverse all the PAs is the ext4_discard_preallocations()
> call where we discard all the PAs of an inode one by one (eg when
> closing the file etc).  Such a discard is a colder path as we don't
> usually expect to do it as often as say allocating blocks to an inode.
> 
> Originally, the limit was added in this patch [1] because of the time
> lost in O(N) traversal in the allocation path (ext4_mb_use_preallocated
> and ext4_mb_normalize_request). Since the rbtree addressed this
> scalability issue we had decided to remove the trim logic in this
> patchset.
> 
> [1]
> https://lore.kernel.org/all/d7a98178-056b-6db5-6bce-4ead23f4a257@gmail.com/

I agree the O(N) traversal is not in any performance sensitive path.

> That being said, I do agree that there should be some way to limit the
> PAs from taking up an unreasonable amount of buddy space, memory and CPU
> cycles in use cases like database files and disk files of long running
> VMs. Previously the limit was 512 PAs per inode and trim was happening
> in an LRU fashion, which is not very straightforward to implement in
> trees. 
> 
> Another approach is rather than having a hard limit, we can throttle the
> PAs based on some parameter like total active PAs in FS or FSUtil% of
> the PAs but we might need to take care of fairness so one inode is not
> holding all the PAs while others get throttled.
> 
> Anyways, I think the trimming part would need some brainstorming to get
> right so just wondering if we could keep that as part of a separate
> patchset and remove the trimming logic for now since rbtree has
> addressed the scalability concerns in allocation path.

I agree the fact it took until 2020 for someone to notice inode PAs can
be cumulating enough for full scan to matter on block allocation means that
this is not a pressing issue. So I'm OK postponing it for now since I also
don't have a great idea how to best trim excessive preallocations.

								Honza
Ojaswin Mujoo Oct. 6, 2022, 10:03 a.m. UTC | #4
On Thu, Oct 06, 2022 at 10:59:58AM +0200, Jan Kara wrote:
> On Thu 06-10-22 12:25:00, Ojaswin Mujoo wrote:
> > On Thu, Sep 29, 2022 at 02:53:11PM +0200, Jan Kara wrote:
> > > On Tue 27-09-22 14:46:48, Ojaswin Mujoo wrote:
> > > > Earlier, inode PAs were stored in a linked list. This caused a need to
> > > > periodically trim the list down inorder to avoid growing it to a very
> > > > large size, as this would severly affect performance during list
> > > > iteration.
> > > > 
> > > > Recent patches changed this list to an rbtree, and since the tree scales
> > > > up much better, we no longer need to have the trim functionality, hence
> > > > remove it.
> > > > 
> > > > Signed-off-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
> > > > Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
> > > 
> > > I'm kind of wondering: Now there won't be performance issues with much
> > > more inode PAs but probably we don't want to let them grow completely out
> > > of control? E.g. I can imagine that if we'd have 1 billion of inode PAs
> > > attached to an inode, things would get wonky both in terms of memory
> > > consumption and also in terms of CPU time spent for the cases where we
> > > still do iterate all of the PAs... Is there anything which keeps inode PAs
> > > reasonably bounded?
> > > 
> > > 								Honza
> > > 
> > Hi Jan,
> > 
> > Sorry for the delay in response, I was on leave for the last few days.
> > 
> > So as per my understanding, after this patch, the only path where we
> > would need to traverse all the PAs is the ext4_discard_preallocations()
> > call where we discard all the PAs of an inode one by one (eg when
> > closing the file etc).  Such a discard is a colder path as we don't
> > usually expect to do it as often as say allocating blocks to an inode.
> > 
> > Originally, the limit was added in this patch [1] because of the time
> > lost in O(N) traversal in the allocation path (ext4_mb_use_preallocated
> > and ext4_mb_normalize_request). Since the rbtree addressed this
> > scalability issue we had decided to remove the trim logic in this
> > patchset.
> > 
> > [1]
> > https://lore.kernel.org/all/d7a98178-056b-6db5-6bce-4ead23f4a257@gmail.com/
> 
> I agree the O(N) traversal is not in any performance sensitive path.
> 
> > That being said, I do agree that there should be some way to limit the
> > PAs from taking up an unreasonable amount of buddy space, memory and CPU
> > cycles in use cases like database files and disk files of long running
> > VMs. Previously the limit was 512 PAs per inode and trim was happening
> > in an LRU fashion, which is not very straightforward to implement in
> > trees. 
> > 
> > Another approach is rather than having a hard limit, we can throttle the
> > PAs based on some parameter like total active PAs in FS or FSUtil% of
> > the PAs but we might need to take care of fairness so one inode is not
> > holding all the PAs while others get throttled.
> > 
> > Anyways, I think the trimming part would need some brainstorming to get
> > right so just wondering if we could keep that as part of a separate
> > patchset and remove the trimming logic for now since rbtree has
> > addressed the scalability concerns in allocation path.
> 
> I agree the fact it took until 2020 for someone to notice inode PAs can
> be cumulating enough for full scan to matter on block allocation means that
> this is not a pressing issue. So I'm OK postponing it for now since I also
> don't have a great idea how to best trim excessive preallocations.
> 
> 								Honza
Right, so I think I'll post a [PATCH v1] with the changes you suggested
and keep this patch as it is for now.

Thanks,
Ojaswin
> -- 
> Jan Kara <jack@suse.com>
> SUSE Labs, CR
diff mbox series

Patch

diff --git a/Documentation/admin-guide/ext4.rst b/Documentation/admin-guide/ext4.rst
index 4c559e08d11e..5740d85439ff 100644
--- a/Documentation/admin-guide/ext4.rst
+++ b/Documentation/admin-guide/ext4.rst
@@ -489,9 +489,6 @@  Files in /sys/fs/ext4/<devname>:
         multiple of this tuning parameter if the stripe size is not set in the
         ext4 superblock
 
-  mb_max_inode_prealloc
-        The maximum length of per-inode ext4_prealloc_space list.
-
   mb_max_to_scan
         The maximum number of extents the multiblock allocator will search to
         find the best extent.
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d54b972f1f0f..bca4b41cc192 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1612,7 +1612,6 @@  struct ext4_sb_info {
 	unsigned int s_mb_stats;
 	unsigned int s_mb_order2_reqs;
 	unsigned int s_mb_group_prealloc;
-	unsigned int s_mb_max_inode_prealloc;
 	unsigned int s_max_dir_size_kb;
 	/* where last allocation was done - for stream allocation */
 	unsigned long s_mb_last_group;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index cd19b9e84767..57e1ec88477a 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3420,7 +3420,6 @@  int ext4_mb_init(struct super_block *sb)
 	sbi->s_mb_stats = MB_DEFAULT_STATS;
 	sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
 	sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
-	sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC;
 	/*
 	 * The default group preallocation is 512, which for 4k block
 	 * sizes translates to 2 megabytes.  However for bigalloc file
@@ -5546,29 +5545,11 @@  static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
 	return ;
 }
 
-/*
- * if per-inode prealloc list is too long, trim some PA
- */
-static void ext4_mb_trim_inode_pa(struct inode *inode)
-{
-	struct ext4_inode_info *ei = EXT4_I(inode);
-	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-	int count, delta;
-
-	count = atomic_read(&ei->i_prealloc_active);
-	delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1;
-	if (count > sbi->s_mb_max_inode_prealloc + delta) {
-		count -= sbi->s_mb_max_inode_prealloc;
-		ext4_discard_preallocations(inode, count);
-	}
-}
-
 /*
  * release all resource we used in allocation
  */
 static int ext4_mb_release_context(struct ext4_allocation_context *ac)
 {
-	struct inode *inode = ac->ac_inode;
 	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
 	struct ext4_prealloc_space *pa = ac->ac_pa;
 	if (pa) {
@@ -5604,7 +5585,6 @@  static int ext4_mb_release_context(struct ext4_allocation_context *ac)
 	if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
 		mutex_unlock(&ac->ac_lg->lg_mutex);
 	ext4_mb_collect_stats(ac);
-	ext4_mb_trim_inode_pa(inode);
 	return 0;
 }
 
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index f8e8ee493867..6d85ee8674a6 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -73,11 +73,6 @@ 
  */
 #define MB_DEFAULT_GROUP_PREALLOC	512
 
-/*
- * maximum length of inode prealloc list
- */
-#define MB_DEFAULT_MAX_INODE_PREALLOC	512
-
 /*
  * Number of groups to search linearly before performing group scanning
  * optimization.
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index d233c24ea342..f0d42cf44c71 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -214,7 +214,6 @@  EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
 EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
 EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
-EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc);
 EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups);
 EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
 EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
@@ -264,7 +263,6 @@  static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(mb_order2_req),
 	ATTR_LIST(mb_stream_req),
 	ATTR_LIST(mb_group_prealloc),
-	ATTR_LIST(mb_max_inode_prealloc),
 	ATTR_LIST(mb_max_linear_groups),
 	ATTR_LIST(max_writeback_mb_bump),
 	ATTR_LIST(extent_max_zeroout_kb),