diff mbox series

[v3,1/2] ext4: introduce EXT4_BG_WAS_TRIMMED to optimize trim

Message ID 1592831677-13945-1-git-send-email-wangshilong1991@gmail.com
State New
Headers show
Series [v3,1/2] ext4: introduce EXT4_BG_WAS_TRIMMED to optimize trim | expand

Commit Message

Wang Shilong June 22, 2020, 1:14 p.m. UTC
From: Wang Shilong <wshilong@ddn.com>

Currently WAS_TRIMMED flag is not persistent, whenever filesystem was
remounted, fstrim need walk all block groups again, the problem with
this is FSTRIM could be slow on very large LUN SSD based filesystem.

To avoid this kind of problem, we introduce a block group flag
EXT4_BG_WAS_TRIMMED, the side effect of this is we need introduce
extra one block group dirty write after trimming block group.

And When clearing TRIMMED flag, block group will be journalled
anyway, so it won't introduce any overhead.

Cc: Shuichi Ihara <sihara@ddn.com>
Cc: Andreas Dilger <adilger@dilger.ca>
Cc: Wang Shilong <wangshilong1991@gmail.com>
Signed-off-by: Wang Shilong <wshilong@ddn.com>
---
v2->v3:
don't renumber EXT4_GROUP_INFO_* bits.
v1->v2:
call ext4_journal_get_write_access() before modify buffer.
---
 fs/ext4/ext4.h      | 14 +++++------
 fs/ext4/ext4_jbd2.h |  3 ++-
 fs/ext4/mballoc.c   | 59 +++++++++++++++++++++++++++++++++------------
 3 files changed, 53 insertions(+), 23 deletions(-)

Comments

Andreas Dilger June 22, 2020, 5:18 p.m. UTC | #1
On Jun 22, 2020, at 7:14 AM, Wang Shilong <wangshilong1991@gmail.com> wrote:
> 
> From: Wang Shilong <wshilong@ddn.com>
> 
> Currently WAS_TRIMMED flag is not persistent, whenever filesystem was
> remounted, fstrim need walk all block groups again, the problem with
> this is FSTRIM could be slow on very large LUN SSD based filesystem.
> 
> To avoid this kind of problem, we introduce a block group flag
> EXT4_BG_WAS_TRIMMED, the side effect of this is we need introduce
> extra one block group dirty write after trimming block group.
> 
> And When clearing TRIMMED flag, block group will be journalled
> anyway, so it won't introduce any overhead.
> 
> Cc: Shuichi Ihara <sihara@ddn.com>
> Cc: Andreas Dilger <adilger@dilger.ca>
> Cc: Wang Shilong <wangshilong1991@gmail.com>
> Signed-off-by: Wang Shilong <wshilong@ddn.com>

Reviewed-by: Andreas Dilger <adilger@dilger.ca>

> ---
> v2->v3:
> don't renumber EXT4_GROUP_INFO_* bits.
> v1->v2:
> call ext4_journal_get_write_access() before modify buffer.
> ---
> fs/ext4/ext4.h      | 14 +++++------
> fs/ext4/ext4_jbd2.h |  3 ++-
> fs/ext4/mballoc.c   | 59 +++++++++++++++++++++++++++++++++------------
> 3 files changed, 53 insertions(+), 23 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 42f5060f3cdf..252754da2f1b 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -368,6 +368,7 @@ struct flex_groups {
> #define EXT4_BG_INODE_UNINIT	0x0001 /* Inode table/bitmap not in use */
> #define EXT4_BG_BLOCK_UNINIT	0x0002 /* Block bitmap not in use */
> #define EXT4_BG_INODE_ZEROED	0x0004 /* On-disk itable initialized to zero */
> +#define EXT4_BG_WAS_TRIMMED	0x0008 /* block group was trimmed */
> 
> /*
>  * Macro-instructions used to manage group descriptors
> @@ -3138,7 +3139,6 @@ struct ext4_group_info {
> };
> 
> #define EXT4_GROUP_INFO_NEED_INIT_BIT		0
> -#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT		1
> #define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT	2
> #define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT	3
> #define EXT4_GROUP_INFO_BBITMAP_CORRUPT		\
> @@ -3153,12 +3153,12 @@ struct ext4_group_info {
> #define EXT4_MB_GRP_IBITMAP_CORRUPT(grp)	\
> 	(test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
> 
> -#define EXT4_MB_GRP_WAS_TRIMMED(grp)	\
> -	(test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
> -#define EXT4_MB_GRP_SET_TRIMMED(grp)	\
> -	(set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
> -#define EXT4_MB_GRP_CLEAR_TRIMMED(grp)	\
> -	(clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
> +#define EXT4_MB_GDP_WAS_TRIMMED(gdp)	\
> +	(gdp->bg_flags & cpu_to_le16(EXT4_BG_WAS_TRIMMED))
> +#define EXT4_MB_GDP_SET_TRIMMED(gdp)	\
> +	(gdp->bg_flags |= cpu_to_le16(EXT4_BG_WAS_TRIMMED))
> +#define EXT4_MB_GDP_CLEAR_TRIMMED(gdp)	\
> +	(gdp->bg_flags &= ~cpu_to_le16(EXT4_BG_WAS_TRIMMED))
> 
> #define EXT4_MAX_CONTENTION		8
> #define EXT4_CONTENTION_THRESHOLD	2
> diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
> index 00dc668e052b..a37e438f4b4d 100644
> --- a/fs/ext4/ext4_jbd2.h
> +++ b/fs/ext4/ext4_jbd2.h
> @@ -123,7 +123,8 @@
> #define EXT4_HT_MOVE_EXTENTS     9
> #define EXT4_HT_XATTR           10
> #define EXT4_HT_EXT_CONVERT     11
> -#define EXT4_HT_MAX             12
> +#define EXT4_HT_FS_TRIM		12
> +#define EXT4_HT_MAX             13
> 
> /**
>  *   struct ext4_journal_cb_entry - Base structure for callback information.
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index c0a331e2feb0..235a316584d0 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -2923,15 +2923,6 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
> 	rb_erase(&entry->efd_node, &(db->bb_free_root));
> 	mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
> 
> -	/*
> -	 * Clear the trimmed flag for the group so that the next
> -	 * ext4_trim_fs can trim it.
> -	 * If the volume is mounted with -o discard, online discard
> -	 * is supported and the free blocks will be trimmed online.
> -	 */
> -	if (!test_opt(sb, DISCARD))
> -		EXT4_MB_GRP_CLEAR_TRIMMED(db);
> -
> 	if (!db->bb_free_root.rb_node) {
> 		/* No more items in the per group rb tree
> 		 * balance refcounts from ext4_mb_free_metadata()
> @@ -5084,8 +5075,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
> 					 " group:%d block:%d count:%lu failed"
> 					 " with %d", block_group, bit, count,
> 					 err);
> -		} else
> -			EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
> +		}
> 
> 		ext4_lock_group(sb, block_group);
> 		mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
> @@ -5095,6 +5085,14 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
> 	ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
> 	ext4_free_group_clusters_set(sb, gdp, ret);
> 	ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
> +	/*
> +	 * Clear the trimmed flag for the group so that the next
> +	 * ext4_trim_fs can trim it.
> +	 * If the volume is mounted with -o discard, online discard
> +	 * is supported and the free blocks will be trimmed online.
> +	 */
> +	if (!test_opt(sb, DISCARD))
> +		EXT4_MB_GDP_CLEAR_TRIMMED(gdp);
> 	ext4_group_desc_csum_set(sb, block_group, gdp);
> 	ext4_unlock_group(sb, block_group);
> 
> @@ -5348,8 +5346,15 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
> 	ext4_grpblk_t next, count = 0, free_count = 0;
> 	struct ext4_buddy e4b;
> 	int ret = 0;
> +	struct ext4_group_desc *gdp;
> +	struct buffer_head *gdp_bh;
> 
> 	trace_ext4_trim_all_free(sb, group, start, max);
> +	gdp = ext4_get_group_desc(sb, group, &gdp_bh);
> +	if (!gdp) {
> +		ret = -EIO;
> +		return ret;
> +	}
> 
> 	ret = ext4_mb_load_buddy(sb, group, &e4b);
> 	if (ret) {
> @@ -5360,7 +5365,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
> 	bitmap = e4b.bd_bitmap;
> 
> 	ext4_lock_group(sb, group);
> -	if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
> +	if (EXT4_MB_GDP_WAS_TRIMMED(gdp) &&
> 	    minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
> 		goto out;
> 
> @@ -5399,14 +5404,38 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
> 			break;
> 	}
> 
> -	if (!ret) {
> +	if (!ret)
> 		ret = count;
> -		EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
> -	}
> out:
> 	ext4_unlock_group(sb, group);
> 	ext4_mb_unload_buddy(&e4b);
> +	if (ret > 0) {
> +		int err;
> +		handle_t *handle;
> 
> +		handle = ext4_journal_start_sb(sb, EXT4_HT_FS_TRIM, 1);
> +		if (IS_ERR(handle)) {
> +			ret = PTR_ERR(handle);
> +			goto out_return;
> +		}
> +		err = ext4_journal_get_write_access(handle, gdp_bh);
> +		if (err) {
> +			ret = err;
> +			goto out_journal;
> +		}
> +		ext4_lock_group(sb, group);
> +		EXT4_MB_GDP_SET_TRIMMED(gdp);
> +		ext4_group_desc_csum_set(sb, group, gdp);
> +		ext4_unlock_group(sb, group);
> +		err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
> +		if (err)
> +			ret = err;
> +out_journal:
> +		err = ext4_journal_stop(handle);
> +		if (err)
> +			ret = err;
> +	}
> +out_return:
> 	ext4_debug("trimmed %d blocks in the group %d\n",
> 		count, group);
> 
> --
> 2.25.4
> 


Cheers, Andreas
Theodore Y. Ts'o Aug. 6, 2020, 4:47 a.m. UTC | #2
On Mon, Jun 22, 2020 at 10:14:36PM +0900, Wang Shilong wrote:
> From: Wang Shilong <wshilong@ddn.com>
> 
> Currently WAS_TRIMMED flag is not persistent, whenever filesystem was
> remounted, fstrim need walk all block groups again, the problem with
> this is FSTRIM could be slow on very large LUN SSD based filesystem.
> 
> To avoid this kind of problem, we introduce a block group flag
> EXT4_BG_WAS_TRIMMED, the side effect of this is we need introduce
> extra one block group dirty write after trimming block group.
> 
> And When clearing TRIMMED flag, block group will be journalled
> anyway, so it won't introduce any overhead.

This persistent flag will not be accurate if there are blocks that
were freed in the block group in the same transaction, before
EXT4_BG_WAS_TRIMMED flag is set.

That's because we can't trim (or reuse) a block which has been
released until the transaction has committed, since if we crash before
it is commited, the file unlink or truncate will not have happened,
and so we can't trash the block until after the deallocation has been
freed.

This problem is also there with a non-persistent flag, granted; but
when the file system is unmounted and remounted, we will eventually
trim the block via a fstrim.  When we make the flag persistent, the
problem becomes worse, since it might mean that there are some blocks
that have been released, that might never get discarded.

I suppose the question is whether the sysadmin really wants unused
blocks to be discarded, either to not leak blocks in some kind of
thin-provisioned storage device, or if the sysadmin is depending on
the discard for some kind of security/privacy application (because
they know that a particular storage device actually has reliable,
secure discards), and how does that get balanced with sysadmins think
performance of fstrim is more important, especially if the device is
really slow at doing discard.

					- Ted
Wang Shilong Aug. 8, 2020, 1:29 a.m. UTC | #3
On Thu, Aug 6, 2020 at 12:47 PM <tytso@mit.edu> wrote:
>
> On Mon, Jun 22, 2020 at 10:14:36PM +0900, Wang Shilong wrote:
> > From: Wang Shilong <wshilong@ddn.com>
> >
> > Currently WAS_TRIMMED flag is not persistent, whenever filesystem was
> > remounted, fstrim need walk all block groups again, the problem with
> > this is FSTRIM could be slow on very large LUN SSD based filesystem.
> >
> > To avoid this kind of problem, we introduce a block group flag
> > EXT4_BG_WAS_TRIMMED, the side effect of this is we need introduce
> > extra one block group dirty write after trimming block group.
> >
> > And When clearing TRIMMED flag, block group will be journalled
> > anyway, so it won't introduce any overhead.
>
> This persistent flag will not be accurate if there are blocks that
> were freed in the block group in the same transaction, before
> EXT4_BG_WAS_TRIMMED flag is set.

Yup, i thought something like this, this might not be accurate, but this won't
cause some data corruptions etc, and trying to write data which was not
trimmed in advance, SSD will do erase finally.

And i sent e2fsprogs support which could clear all block groups
EXT4_BG_WAS_TRIMMED
flags, but it needs umounted state of course.

And for our case, when admin running fstrim on filesystem, usually there
are few IO for filesystem, so most of case, it might be fine.
>
> That's because we can't trim (or reuse) a block which has been
> released until the transaction has committed, since if we crash before
> it is commited, the file unlink or truncate will not have happened,
> and so we can't trash the block until after the deallocation has been
> freed.
>
> This problem is also there with a non-persistent flag, granted; but
> when the file system is unmounted and remounted, we will eventually
> trim the block via a fstrim.  When we make the flag persistent, the
> problem becomes worse, since it might mean that there are some blocks
> that have been released, that might never get discarded.
>
> I suppose the question is whether the sysadmin really wants unused
> blocks to be discarded, either to not leak blocks in some kind of
> thin-provisioned storage device, or if the sysadmin is depending on
> the discard for some kind of security/privacy application (because
> they know that a particular storage device actually has reliable,
> secure discards), and how does that get balanced with sysadmins think
> performance of fstrim is more important, especially if the device is
> really slow at doing discard.

Yup, that is good point, for our case, fstrim could take hours to complete
as it needs extra IO for disk arrays, so we really want repeated fstrim.

So what do you think extra mount option or a feature bit in the superblock.
In default, we still keep ext4 in previous behavior, but once turned
on it, we have
this optimized  "inaccurate" optimizations.

Thank you!
Shilong


>
>                                         - Ted
Theodore Y. Ts'o Aug. 8, 2020, 3:18 p.m. UTC | #4
On Sat, Aug 08, 2020 at 09:29:50AM +0800, Wang Shilong wrote:
> > I suppose the question is whether the sysadmin really wants unused
> > blocks to be discarded, either to not leak blocks in some kind of
> > thin-provisioned storage device, or if the sysadmin is depending on
> > the discard for some kind of security/privacy application (because
> > they know that a particular storage device actually has reliable,
> > secure discards), and how does that get balanced with sysadmins think
> > performance of fstrim is more important, especially if the device is
> > really slow at doing discard.
> 
> Yup, that is good point, for our case, fstrim could take hours to complete
> as it needs extra IO for disk arrays, so we really want repeated fstrim.
> 
> So what do you think extra mount option or a feature bit in the superblock.
> In default, we still keep ext4 in previous behavior, but once turned
> on it, we have this optimized  "inaccurate" optimizations.

So what I was thinking was we could define a new flag which would be
set in es->s_flags in the on-disk superblock:

#define EXT2_FLAGS_PERSISTENT_TRIM_TRACKING 0x0008

If this flag is set, then the EXT4_BG_WAS_TRIMMED flags will be
honored; otherwise, they will be ignored when FITRIM is executed and
the block group will be unconditionally trimmed.

The advantage of doing this way is that we don't need to allocate a
new feature bit, and older versions of e2fsck won't have heartburn
over seeing a feature bit it doesn't understand.  I also suspect this
is something that the system administrator will either always want
enabled or disabled, so it's better to make it be a tunable to be set
via tune2fs.

The other thing we could do is to define a new variant of the FITRIM
ioctl which will also force the unconditional trimming of the block
groups, so that an administrator can force trim all of the block
groups without needing to mess with mounting and unmounting the
superblock.

What do you think?

						- Ted
Andreas Dilger Aug. 9, 2020, 4:33 a.m. UTC | #5
On Aug 8, 2020, at 9:18 AM, tytso@mit.edu wrote:
> 
> On Sat, Aug 08, 2020 at 09:29:50AM +0800, Wang Shilong wrote:
>>> I suppose the question is whether the sysadmin really wants unused
>>> blocks to be discarded, either to not leak blocks in some kind of
>>> thin-provisioned storage device, or if the sysadmin is depending on
>>> the discard for some kind of security/privacy application (because
>>> they know that a particular storage device actually has reliable,
>>> secure discards), and how does that get balanced with sysadmins think
>>> performance of fstrim is more important, especially if the device is
>>> really slow at doing discard.
>> 
>> Yup, that is good point, for our case, fstrim could take hours to complete
>> as it needs extra IO for disk arrays, so we really want repeated fstrim.
>> 
>> So what do you think extra mount option or a feature bit in the superblock.
>> In default, we still keep ext4 in previous behavior, but once turned
>> on it, we have this optimized  "inaccurate" optimizations.
> 
> So what I was thinking was we could define a new flag which would be
> set in es->s_flags in the on-disk superblock:
> 
> #define EXT2_FLAGS_PERSISTENT_TRIM_TRACKING 0x0008
> 
> If this flag is set, then the EXT4_BG_WAS_TRIMMED flags will be
> honored; otherwise, they will be ignored when FITRIM is executed and
> the block group will be unconditionally trimmed.
> 
> The advantage of doing this way is that we don't need to allocate a
> new feature bit, and older versions of e2fsck won't have heartburn
> over seeing a feature bit it doesn't understand.  I also suspect this
> is something that the system administrator will either always want
> enabled or disabled, so it's better to make it be a tunable to be set
> via tune2fs.
> 
> The other thing we could do is to define a new variant of the FITRIM
> ioctl which will also force the unconditional trimming of the block
> groups, so that an administrator can force trim all of the block
> groups without needing to mess with mounting and unmounting the
> superblock.
> 
> What do you think?

What about storing "s_min_freed_blocks_to_trim" persistently in the
superblock, and then the admin can adjust this as desired?  If it is
set =1, then the "lazy trim" optimization would be disabled (every
FITRIM request would honor the trim requests whenever there is a
freed block in a group).  I suppose we could allow =0 to mean "do not
store the WAS_TRIMMED flag persistently", so there would be no change
for current behavior, and it would require a tune2fs option to set the
new value into the superblock (though we might consider setting this
to a non-zero value in mke2fs by default).



The other thing we were thinkgin about was changing the "-o discard" code
to leverage the WAS_TRIMMED flag, and just do bulk trim periodically
in the filesystem as blocks are freed from groups, rather than tracking
freed extents in memory and submitting trims actively during IO.  Instead,
it would track groups that exceed "s_min_freed_blocks_to_trim", and trim
the whole group in the background when the filesystem is not active.

Cheers, Andreas
Theodore Y. Ts'o Aug. 10, 2020, 1:24 p.m. UTC | #6
On Sat, Aug 08, 2020 at 10:33:08PM -0600, Andreas Dilger wrote:
> What about storing "s_min_freed_blocks_to_trim" persistently in the
> superblock, and then the admin can adjust this as desired?  If it is
> set =1, then the "lazy trim" optimization would be disabled (every
> FITRIM request would honor the trim requests whenever there is a
> freed block in a group).  I suppose we could allow =0 to mean "do not
> store the WAS_TRIMMED flag persistently", so there would be no change
> for current behavior, and it would require a tune2fs option to set the
> new value into the superblock (though we might consider setting this
> to a non-zero value in mke2fs by default).

Currently the the minimum blocks to trim is passed in to FITRIM from
userspace; so we would need to define how the passed-in value from the
fstrim program interacts with the value stored in the sueprblock.
Would we always ignore the value passed-in from userspace?  That
doesn't seem right...

> The other thing we were thinkgin about was changing the "-o discard" code
> to leverage the WAS_TRIMMED flag, and just do bulk trim periodically
> in the filesystem as blocks are freed from groups, rather than tracking
> freed extents in memory and submitting trims actively during IO.  Instead,
> it would track groups that exceed "s_min_freed_blocks_to_trim", and trim
> the whole group in the background when the filesystem is not active.

Hmm, maybe.  That's an awful lot of complexity, which is my concern
with that approach.

Part of the problem here is that discard is being used for different
things for different use cases and devices with different discard
speeds.  Right now, one of the primary uses of -o discard is for
people who have fast discard implementation(s and/or people who really
want to make sure every freed block is immediately discard --- perhaps
to meet security / privacy requirements (such as HIPPA compliance,
etc.).   I don't want to break that.

We now have a requirement of people who have very slow discards --- I
think at one point people mentioned something about for devices using
HDD, probably in some kind of dm-thin use case?  One solution that we
can use for those is simply use fstrim -m 8M or some such.  But it
appears that part of the problem is people do want more precision than
that?

Another solution might be to skip trimming block groups if there have
been blocks that have been freshly freed that are pending a commit,
and skip that block group until the commit has completed.  That might
also help reduce contention on a busy file system.

Yet another solution might be bias block allocations towards LBA
Uranges that have been deleted recently --- since another way to avoid
trims is to simply overwrite those LBA's.  But then the question is
how much memory are we willing to dedicate towards tracking recently
released LBA's, and to what level of granularity?  Perhaps we just
track the freed extents, and if they don't get used within a certain
period, or if we start getting put under memory pressure, we then send
the discards at that point.

Ultimately, though, this is a space full of trade offs, and I'm
reminded of one of my father's favorite Chinese sayings: "You're
demanding a horse which can run fast, but which doesn't eat much
grass." (又要马儿跑,又要马儿不吃草).  Or translated more
idiomatically, you can't have your cake and eat it too.  It seems this
desire transcends all cultures.  :-)

	       	   	      	   	- Ted
Andreas Dilger Aug. 12, 2020, 11:14 p.m. UTC | #7
On Aug 10, 2020, at 7:24 AM, tytso@mit.edu wrote:
> 
> On Sat, Aug 08, 2020 at 10:33:08PM -0600, Andreas Dilger wrote:
>> What about storing "s_min_freed_blocks_to_trim" persistently in the
>> superblock, and then the admin can adjust this as desired?  If it is
>> set =1, then the "lazy trim" optimization would be disabled (every
>> FITRIM request would honor the trim requests whenever there is a
>> freed block in a group).  I suppose we could allow =0 to mean "do not
>> store the WAS_TRIMMED flag persistently", so there would be no change
>> for current behavior, and it would require a tune2fs option to set the
>> new value into the superblock (though we might consider setting this
>> to a non-zero value in mke2fs by default).
> 
> Currently the the minimum blocks to trim is passed in to FITRIM from
> userspace; so we would need to define how the passed-in value from the
> fstrim program interacts with the value stored in the sueprblock.
> Would we always ignore the value passed-in from userspace?  That
> doesn't seem right...

The s_min_freed_blocks_to_trim value would decide whether the persistent
WAS_TRIMMED flag is cleared from the group or not when blocks are freed.
If =1 then every block freed in a group will clear the WAS_TRIMMED flag
(this is essentially the current behavior).  If > 1 (the default with
the second patch in the series) then multiple blocks need to be freed
from a group before it is reconsidered for trim, which is reasonable,
since few underlying devices have 1-block TRIM granularity and issuing
a new TRIM each time is just overhead.  If =0 then the WAS_TRIMMED flag
is never saved, and every FITRIM call will result in every group being
trimmed again, which is what currently happens when the filesystem is
newly mounted.

>> The other thing we were thinkgin about was changing the "-o discard" code
>> to leverage the WAS_TRIMMED flag, and just do bulk trim periodically
>> in the filesystem as blocks are freed from groups, rather than tracking
>> freed extents in memory and submitting trims actively during IO.  Instead,
>> it would track groups that exceed "s_min_freed_blocks_to_trim", and trim
>> the whole group in the background when the filesystem is not active.
> 
> Hmm, maybe.  That's an awful lot of complexity, which is my concern
> with that approach.
> 
> Part of the problem here is that discard is being used for different
> things for different use cases and devices with different discard
> speeds.  Right now, one of the primary uses of -o discard is for
> people who have fast discard implementations and/or people who really
> want to make sure every freed block is immediately discard --- perhaps
> to meet security / privacy requirements (such as HIPPA compliance,
> etc.).   I don't want to break that.
> 
> We now have a requirement of people who have very slow discards --- I
> think at one point people mentioned something about for devices using
> HDD, probably in some kind of dm-thin use case?  One solution that we
> can use for those is simply use fstrim -m 8M or some such.  But it
> appears that part of the problem is people do want more precision than
> that?

At least in the DDN case it isn't for HDDs but just for huge NVMe RAID
devices where the many millions of TRIM requests take a noticeable time.
Also, even if the TRIM is done by mke2fs, fstrim will TRIM the whole
filesystem again after each mount because this state is not persistently
stored in the group descriptors.

> Another solution might be to skip trimming block groups if there have
> been blocks that have been freshly freed that are pending a commit,
> and skip that block group until the commit has completed.  That might
> also help reduce contention on a busy file system.

I think even with fast TRIM devices, sending lots of them inline with
other requests causes IO serialization, so "-o discard" is bad for
performance and has complex in-memory tracking of extents to TRIM.  The
other drawback of "-o discard" is even with "perfect" TRIM, the size
or alignment may not be suitable for the underlying NAND, so TRIMs that
eventually cover the entire group could in fact release none of the
underlying space because individually they weren't the right size/offset.

The proposal is to allow efficiently (i.e. 1 bit per group on disk, one
int per group in memory) aggregating and tracking which groups can
benefit from a TRIM request (e.g. after multi-MB of blocks are freed
there), rather than "-o discard" sending hundreds of TRIMs when individual
blocks/extents are freed.  The alternative of running "fstrim" from cron
is also not ideal, because it is hard to know what interval to use, and
there is no way to know if the filesystem is idle and it is a good time
to run, or if it *was* idle but suddenly becomes busy after fstrim starts.


Adding filesystem load-detection for fstrim, so that it only trims groups
while idle and stops when there is IO, would solve the contention issue
with actual filesystem usage.  At that point, the "enhancement" would
just be to essentially have fstrim restart periodically internally when
"-o discard" is used (or maybe "-o fstrim" instead?), with optimizations
to speed up finding groups that need to be trimmed as opposed to a full
scan each time.  Maybe the optimizations are premature, and a full scan
is easy enough even when there are millions of groups?  I guess if mballoc
can do full-group scans then fstrim can do the same easily enough.


> Yet another solution might be bias block allocations towards LBA
> ranges that have been deleted recently --- since another way to avoid
> trims is to simply overwrite those LBA's.  But then the question is
> how much memory are we willing to dedicate towards tracking recently
> released LBA's, and to what level of granularity?  Perhaps we just
> track the freed extents, and if they don't get used within a certain
> period, or if we start getting put under memory pressure, we then send
> the discards at that point.

I'd think that overwriting just-freed blocks is the opposite of what we
want?  Overwriting NAND blocks results in an erase cycle, and TRIM is
intended to pre-erase the blocks so that this doesn't need to be done
when there is a new write.

> Ultimately, though, this is a space full of trade offs, and I'm
> reminded of one of my father's favorite Chinese sayings: "You're
> demanding a horse which can run fast, but which doesn't eat much
> grass." (又要马儿跑,又要马儿不吃草).  Or translated more
> idiomatically, you can't have your cake and eat it too.  It seems
> this desire transcends all cultures.  :-)


Cheers, Andreas
Christoph Hellwig Aug. 14, 2020, 8:06 a.m. UTC | #8
On Mon, Aug 10, 2020 at 09:24:57AM -0400, tytso@mit.edu wrote:
> Part of the problem here is that discard is being used for different
> things for different use cases and devices with different discard
> speeds.  Right now, one of the primary uses of -o discard is for
> people who have fast discard implementation(s and/or people who really
> want to make sure every freed block is immediately discard --- perhaps
> to meet security / privacy requirements (such as HIPPA compliance,
> etc.).   I don't want to break that.

Note that discard does not provide any security whatsover.  For one
none of the underlying primitives actually gurantee any action, the
device is free to always ignore parts or all of a discard request.

And even if it didn't that doesn't mean that data couldn't easily
recovered from the media.

> 
> We now have a requirement of people who have very slow discards --- I
> think at one point people mentioned something about for devices using
> HDD, probably in some kind of dm-thin use case?  One solution that we
> can use for those is simply use fstrim -m 8M or some such.  But it
> appears that part of the problem is people do want more precision than
> that?

Device managed SMR drivers usually support TRIM.  But it actually
should be a decently fast operation usually, as those drives have a
remapping layer just like a FTL.
diff mbox series

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 42f5060f3cdf..252754da2f1b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -368,6 +368,7 @@  struct flex_groups {
 #define EXT4_BG_INODE_UNINIT	0x0001 /* Inode table/bitmap not in use */
 #define EXT4_BG_BLOCK_UNINIT	0x0002 /* Block bitmap not in use */
 #define EXT4_BG_INODE_ZEROED	0x0004 /* On-disk itable initialized to zero */
+#define EXT4_BG_WAS_TRIMMED	0x0008 /* block group was trimmed */
 
 /*
  * Macro-instructions used to manage group descriptors
@@ -3138,7 +3139,6 @@  struct ext4_group_info {
 };
 
 #define EXT4_GROUP_INFO_NEED_INIT_BIT		0
-#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT		1
 #define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT	2
 #define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT	3
 #define EXT4_GROUP_INFO_BBITMAP_CORRUPT		\
@@ -3153,12 +3153,12 @@  struct ext4_group_info {
 #define EXT4_MB_GRP_IBITMAP_CORRUPT(grp)	\
 	(test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
 
-#define EXT4_MB_GRP_WAS_TRIMMED(grp)	\
-	(test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
-#define EXT4_MB_GRP_SET_TRIMMED(grp)	\
-	(set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
-#define EXT4_MB_GRP_CLEAR_TRIMMED(grp)	\
-	(clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GDP_WAS_TRIMMED(gdp)	\
+	(gdp->bg_flags & cpu_to_le16(EXT4_BG_WAS_TRIMMED))
+#define EXT4_MB_GDP_SET_TRIMMED(gdp)	\
+	(gdp->bg_flags |= cpu_to_le16(EXT4_BG_WAS_TRIMMED))
+#define EXT4_MB_GDP_CLEAR_TRIMMED(gdp)	\
+	(gdp->bg_flags &= ~cpu_to_le16(EXT4_BG_WAS_TRIMMED))
 
 #define EXT4_MAX_CONTENTION		8
 #define EXT4_CONTENTION_THRESHOLD	2
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 00dc668e052b..a37e438f4b4d 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -123,7 +123,8 @@ 
 #define EXT4_HT_MOVE_EXTENTS     9
 #define EXT4_HT_XATTR           10
 #define EXT4_HT_EXT_CONVERT     11
-#define EXT4_HT_MAX             12
+#define EXT4_HT_FS_TRIM		12
+#define EXT4_HT_MAX             13
 
 /**
  *   struct ext4_journal_cb_entry - Base structure for callback information.
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c0a331e2feb0..235a316584d0 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2923,15 +2923,6 @@  static void ext4_free_data_in_buddy(struct super_block *sb,
 	rb_erase(&entry->efd_node, &(db->bb_free_root));
 	mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
 
-	/*
-	 * Clear the trimmed flag for the group so that the next
-	 * ext4_trim_fs can trim it.
-	 * If the volume is mounted with -o discard, online discard
-	 * is supported and the free blocks will be trimmed online.
-	 */
-	if (!test_opt(sb, DISCARD))
-		EXT4_MB_GRP_CLEAR_TRIMMED(db);
-
 	if (!db->bb_free_root.rb_node) {
 		/* No more items in the per group rb tree
 		 * balance refcounts from ext4_mb_free_metadata()
@@ -5084,8 +5075,7 @@  void ext4_free_blocks(handle_t *handle, struct inode *inode,
 					 " group:%d block:%d count:%lu failed"
 					 " with %d", block_group, bit, count,
 					 err);
-		} else
-			EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
+		}
 
 		ext4_lock_group(sb, block_group);
 		mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
@@ -5095,6 +5085,14 @@  void ext4_free_blocks(handle_t *handle, struct inode *inode,
 	ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
 	ext4_free_group_clusters_set(sb, gdp, ret);
 	ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
+	/*
+	 * Clear the trimmed flag for the group so that the next
+	 * ext4_trim_fs can trim it.
+	 * If the volume is mounted with -o discard, online discard
+	 * is supported and the free blocks will be trimmed online.
+	 */
+	if (!test_opt(sb, DISCARD))
+		EXT4_MB_GDP_CLEAR_TRIMMED(gdp);
 	ext4_group_desc_csum_set(sb, block_group, gdp);
 	ext4_unlock_group(sb, block_group);
 
@@ -5348,8 +5346,15 @@  ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
 	ext4_grpblk_t next, count = 0, free_count = 0;
 	struct ext4_buddy e4b;
 	int ret = 0;
+	struct ext4_group_desc *gdp;
+	struct buffer_head *gdp_bh;
 
 	trace_ext4_trim_all_free(sb, group, start, max);
+	gdp = ext4_get_group_desc(sb, group, &gdp_bh);
+	if (!gdp) {
+		ret = -EIO;
+		return ret;
+	}
 
 	ret = ext4_mb_load_buddy(sb, group, &e4b);
 	if (ret) {
@@ -5360,7 +5365,7 @@  ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
 	bitmap = e4b.bd_bitmap;
 
 	ext4_lock_group(sb, group);
-	if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
+	if (EXT4_MB_GDP_WAS_TRIMMED(gdp) &&
 	    minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
 		goto out;
 
@@ -5399,14 +5404,38 @@  ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
 			break;
 	}
 
-	if (!ret) {
+	if (!ret)
 		ret = count;
-		EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
-	}
 out:
 	ext4_unlock_group(sb, group);
 	ext4_mb_unload_buddy(&e4b);
+	if (ret > 0) {
+		int err;
+		handle_t *handle;
 
+		handle = ext4_journal_start_sb(sb, EXT4_HT_FS_TRIM, 1);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out_return;
+		}
+		err = ext4_journal_get_write_access(handle, gdp_bh);
+		if (err) {
+			ret = err;
+			goto out_journal;
+		}
+		ext4_lock_group(sb, group);
+		EXT4_MB_GDP_SET_TRIMMED(gdp);
+		ext4_group_desc_csum_set(sb, group, gdp);
+		ext4_unlock_group(sb, group);
+		err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
+		if (err)
+			ret = err;
+out_journal:
+		err = ext4_journal_stop(handle);
+		if (err)
+			ret = err;
+	}
+out_return:
 	ext4_debug("trimmed %d blocks in the group %d\n",
 		count, group);