diff mbox series

[V2] ext4: introduce EXT4_BG_TRIMMED to optimize fstrim

Message ID 20230817003504.458920-1-dongyangli@ddn.com
State New
Headers show
Series [V2] ext4: introduce EXT4_BG_TRIMMED to optimize fstrim | expand

Commit Message

Li Dongyang Aug. 17, 2023, 12:35 a.m. UTC
Currently the flag indicating that a block group has been trimmed is
not persistent, so the trim status is lost after a remount. As a
result, fstrim cannot skip the already trimmed groups, which can be
slow on very large devices.

This patch introduces a new block group flag, EXT4_BG_TRIMMED. Setting
it requires 1 extra block group descriptor write after trimming each
block group.
When clearing the flag, the block group descriptor is already being
journalled, so there is no extra overhead.

Add a new super block flag EXT2_FLAGS_TRACK_TRIM, to indicate if
we should honour and set EXT4_BG_TRIMMED when doing fstrim.
The new super block flag can be turned on/off via tune2fs.

Cc: Shuichi Ihara <sihara@ddn.com>
Cc: Andreas Dilger <adilger@dilger.ca>
Cc: Wang Shilong <wangshilong1991@gmail.com>
Signed-off-by: Wang Shilong <wshilong@ddn.com>
Signed-off-by: Li Dongyang <dongyangli@ddn.com>
---
v1->v2:
use cpu_to_le32() with the new super flag.
do not record BG_TRIMMED if TRACK_TRIM is not set in super block.
---
 fs/ext4/ext4.h      | 10 ++-----
 fs/ext4/ext4_jbd2.h |  3 ++-
 fs/ext4/mballoc.c   | 63 +++++++++++++++++++++++++++++++++++----------
 3 files changed, 53 insertions(+), 23 deletions(-)

Comments

Andreas Dilger Aug. 18, 2023, 12:12 a.m. UTC | #1
On Aug 16, 2023, at 6:35 PM, Li Dongyang <dongyangli@ddn.com> wrote:
> 
> Currently the flag indicating block group has done fstrim is not
> persistent, and trim status will be lost after remount, as
> a result fstrim can not skip the already trimmed groups, which
> could be slow on very large devices.
> 
> This patch introduces a new block group flag EXT4_BG_TRIMMED,
> we need 1 extra block group descriptor write after trimming each
> block group.
> When clearing the flag, the block group descriptor is journalled
> already so no extra overhead.
> 
> Add a new super block flag EXT2_FLAGS_TRACK_TRIM, to indicate if
> we should honour and set EXT4_BG_TRIMMED when doing fstrim.
> The new super block flag can be turned on/off via tune2fs.

We discussed this patch on the ext4 developer concall again today,
and Ted agreed the EXT2_FLAGS_TRACK_TRIM flag was OK, and should be
enabled by default in mke2fs (as it already is), otherwise most users
will not benefit from this feature.  Being able to turn this off in
case of problems is still convenient.

There was some discussion about whether the BG_TRIMMED flag should be
set on groups with BLOCK_UNINIT, because the loading of the block
bitmap during trim would initialize the bitmap itself and clear the
BLOCK_UNINIT flag.

Ted's comment on the previous review was:
>> This patch introduces a new block group flag EXT4_BG_TRIMMED,
>> we need 1 extra block group descriptor write after trimming each
>> block group. When clearing the flag, the block group descriptor
>> is journaled already so no extra overhead.
> 
> ... we should not try to set the flag if the
> block group is uninitialized, and we should actually send the discard in
> that case, since presumably the blocks in question were discarded when
> the file system was mkfs'ed.

For newly-formatted filesystems with the BG_TRIMMED support, the flag
is already set at mke2fs on every group after a successful full device
discard, so nothing further is needed I think.

If EXT2_FLAGS_TRACK_TRIM is enabled on an existing filesystem *after*
it is already used, then previous fstrim calls would likely have already
loaded the block bitmap and trimmed the groups so setting BG_TRIMMED
in this case should be fine, regardless of whether BLOCK_UNINIT is set.

Cheers, Andreas

> Signed-off-by: Wang Shilong <wshilong@ddn.com>
> Signed-off-by: Li Dongyang <dongyangli@ddn.com>
> ---
> v1->v2:
> use cpu_to_le32() with the new super flag.
> do not record BG_TRIMMED if TRACK_TRIM is not set in super block.
> ---
> fs/ext4/ext4.h      | 10 ++-----
> fs/ext4/ext4_jbd2.h |  3 ++-
> fs/ext4/mballoc.c   | 63 +++++++++++++++++++++++++++++++++++----------
> 3 files changed, 53 insertions(+), 23 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 0a2d55faa095..a990fb49b24f 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -437,6 +437,7 @@ struct flex_groups {
> #define EXT4_BG_INODE_UNINIT	0x0001 /* Inode table/bitmap not in use */
> #define EXT4_BG_BLOCK_UNINIT	0x0002 /* Block bitmap not in use */
> #define EXT4_BG_INODE_ZEROED	0x0004 /* On-disk itable initialized to zero */
> +#define EXT4_BG_TRIMMED		0x0008 /* block group was trimmed */
> 
> /*
>  * Macro-instructions used to manage group descriptors
> @@ -1166,6 +1167,7 @@ struct ext4_inode_info {
> #define EXT2_FLAGS_SIGNED_HASH		0x0001  /* Signed dirhash in use */
> #define EXT2_FLAGS_UNSIGNED_HASH	0x0002  /* Unsigned dirhash in use */
> #define EXT2_FLAGS_TEST_FILESYS		0x0004	/* to test development code */
> +#define EXT2_FLAGS_TRACK_TRIM		0x0008  /* Track trim status in each bg */
> 
> /*
>  * Mount flags set via mount options or defaults
> @@ -3412,7 +3414,6 @@ struct ext4_group_info {
> };
> 
> #define EXT4_GROUP_INFO_NEED_INIT_BIT		0
> -#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT		1
> #define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT	2
> #define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT	3
> #define EXT4_GROUP_INFO_BBITMAP_CORRUPT		\
> @@ -3427,13 +3428,6 @@ struct ext4_group_info {
> 	(test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
> #define EXT4_MB_GRP_IBITMAP_CORRUPT(grp)	\
> 	(test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
> -
> -#define EXT4_MB_GRP_WAS_TRIMMED(grp)	\
> -	(test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
> -#define EXT4_MB_GRP_SET_TRIMMED(grp)	\
> -	(set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
> -#define EXT4_MB_GRP_CLEAR_TRIMMED(grp)	\
> -	(clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
> #define EXT4_MB_GRP_TEST_AND_SET_READ(grp)	\
> 	(test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state)))
> 
> diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
> index 0c77697d5e90..ce529a454b2a 100644
> --- a/fs/ext4/ext4_jbd2.h
> +++ b/fs/ext4/ext4_jbd2.h
> @@ -120,7 +120,8 @@
> #define EXT4_HT_MOVE_EXTENTS     9
> #define EXT4_HT_XATTR           10
> #define EXT4_HT_EXT_CONVERT     11
> -#define EXT4_HT_MAX             12
> +#define EXT4_HT_FS_TRIM		12
> +#define EXT4_HT_MAX             13
> 
> /**
>  *   struct ext4_journal_cb_entry - Base structure for callback information.
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 21b903fe546e..d537bcdf121d 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -3849,15 +3849,6 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
> 	rb_erase(&entry->efd_node, &(db->bb_free_root));
> 	mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
> 
> -	/*
> -	 * Clear the trimmed flag for the group so that the next
> -	 * ext4_trim_fs can trim it.
> -	 * If the volume is mounted with -o discard, online discard
> -	 * is supported and the free blocks will be trimmed online.
> -	 */
> -	if (!test_opt(sb, DISCARD))
> -		EXT4_MB_GRP_CLEAR_TRIMMED(db);
> -
> 	if (!db->bb_free_root.rb_node) {
> 		/* No more items in the per group rb tree
> 		 * balance refcounts from ext4_mb_free_metadata()
> @@ -6587,8 +6578,7 @@ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
> 					 " group:%u block:%d count:%lu failed"
> 					 " with %d", block_group, bit, count,
> 					 err);
> -		} else
> -			EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
> +		}
> 
> 		ext4_lock_group(sb, block_group);
> 		mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
> @@ -6598,6 +6588,14 @@ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
> 	ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
> 	ext4_free_group_clusters_set(sb, gdp, ret);
> 	ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
> +	/*
> +	 * Clear the trimmed flag for the group so that the next
> +	 * ext4_trim_fs can trim it.
> +	 * If the volume is mounted with -o discard, online discard
> +	 * is supported and the free blocks will be trimmed online.
> +	 */
> +	if (!test_opt(sb, DISCARD))
> +		gdp->bg_flags &= cpu_to_le16(~EXT4_BG_TRIMMED);
> 	ext4_group_desc_csum_set(sb, block_group, gdp);
> 	ext4_unlock_group(sb, block_group);
> 
> @@ -6995,10 +6993,19 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
> 		   ext4_grpblk_t minblocks, bool set_trimmed)
> {
> 	struct ext4_buddy e4b;
> +	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
> +	struct ext4_group_desc *gdp;
> +	struct buffer_head *gd_bh;
> 	int ret;
> 
> 	trace_ext4_trim_all_free(sb, group, start, max);
> 
> +	gdp = ext4_get_group_desc(sb, group, &gd_bh);
> +	if (!gdp) {
> +		ret = -EIO;
> +		return ret;
> +	}
> +
> 	ret = ext4_mb_load_buddy(sb, group, &e4b);
> 	if (ret) {
> 		ext4_warning(sb, "Error %d loading buddy information for %u",
> @@ -7008,11 +7015,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
> 
> 	ext4_lock_group(sb, group);
> 
> -	if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) ||
> +	if (!(es->s_flags & cpu_to_le32(EXT2_FLAGS_TRACK_TRIM) &&
> +	      gdp->bg_flags & cpu_to_le16(EXT4_BG_TRIMMED)) ||
> 	    minblocks < EXT4_SB(sb)->s_last_trim_minblks) {
> 		ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks);
> -		if (ret >= 0 && set_trimmed)
> -			EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
> 	} else {
> 		ret = 0;
> 	}
> @@ -7020,6 +7026,35 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
> 	ext4_unlock_group(sb, group);
> 	ext4_mb_unload_buddy(&e4b);
> 
> +	if (ret > 0 && set_trimmed &&
> +	    es->s_flags & cpu_to_le32(EXT2_FLAGS_TRACK_TRIM)) {
> +		int err;
> +		handle_t *handle;
> +
> +		handle = ext4_journal_start_sb(sb, EXT4_HT_FS_TRIM, 1);
> +		if (IS_ERR(handle)) {
> +			ret = PTR_ERR(handle);
> +			goto out_return;
> +		}
> +		err = ext4_journal_get_write_access(handle, sb, gd_bh,
> +						    EXT4_JTR_NONE);
> +		if (err) {
> +			ret = err;
> +			goto out_journal;
> +		}
> +		ext4_lock_group(sb, group);
> +		gdp->bg_flags |= cpu_to_le16(EXT4_BG_TRIMMED);
> +		ext4_group_desc_csum_set(sb, group, gdp);
> +		ext4_unlock_group(sb, group);
> +		err = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
> +		if (err)
> +			ret = err;
> +out_journal:
> +		err = ext4_journal_stop(handle);
> +		if (err)
> +			ret = err;
> +	}
> +out_return:
> 	ext4_debug("trimmed %d blocks in the group %d\n",
> 		ret, group);
> 
> --
> 2.41.0
> 


Cheers, Andreas
diff mbox series

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0a2d55faa095..a990fb49b24f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -437,6 +437,7 @@  struct flex_groups {
 #define EXT4_BG_INODE_UNINIT	0x0001 /* Inode table/bitmap not in use */
 #define EXT4_BG_BLOCK_UNINIT	0x0002 /* Block bitmap not in use */
 #define EXT4_BG_INODE_ZEROED	0x0004 /* On-disk itable initialized to zero */
+#define EXT4_BG_TRIMMED		0x0008 /* block group was trimmed */
 
 /*
  * Macro-instructions used to manage group descriptors
@@ -1166,6 +1167,7 @@  struct ext4_inode_info {
 #define EXT2_FLAGS_SIGNED_HASH		0x0001  /* Signed dirhash in use */
 #define EXT2_FLAGS_UNSIGNED_HASH	0x0002  /* Unsigned dirhash in use */
 #define EXT2_FLAGS_TEST_FILESYS		0x0004	/* to test development code */
+#define EXT2_FLAGS_TRACK_TRIM		0x0008  /* Track trim status in each bg */
 
 /*
  * Mount flags set via mount options or defaults
@@ -3412,7 +3414,6 @@  struct ext4_group_info {
 };
 
 #define EXT4_GROUP_INFO_NEED_INIT_BIT		0
-#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT		1
 #define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT	2
 #define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT	3
 #define EXT4_GROUP_INFO_BBITMAP_CORRUPT		\
@@ -3427,13 +3428,6 @@  struct ext4_group_info {
 	(test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
 #define EXT4_MB_GRP_IBITMAP_CORRUPT(grp)	\
 	(test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
-
-#define EXT4_MB_GRP_WAS_TRIMMED(grp)	\
-	(test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
-#define EXT4_MB_GRP_SET_TRIMMED(grp)	\
-	(set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
-#define EXT4_MB_GRP_CLEAR_TRIMMED(grp)	\
-	(clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
 #define EXT4_MB_GRP_TEST_AND_SET_READ(grp)	\
 	(test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state)))
 
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 0c77697d5e90..ce529a454b2a 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -120,7 +120,8 @@ 
 #define EXT4_HT_MOVE_EXTENTS     9
 #define EXT4_HT_XATTR           10
 #define EXT4_HT_EXT_CONVERT     11
-#define EXT4_HT_MAX             12
+#define EXT4_HT_FS_TRIM		12
+#define EXT4_HT_MAX             13
 
 /**
  *   struct ext4_journal_cb_entry - Base structure for callback information.
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 21b903fe546e..d537bcdf121d 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3849,15 +3849,6 @@  static void ext4_free_data_in_buddy(struct super_block *sb,
 	rb_erase(&entry->efd_node, &(db->bb_free_root));
 	mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
 
-	/*
-	 * Clear the trimmed flag for the group so that the next
-	 * ext4_trim_fs can trim it.
-	 * If the volume is mounted with -o discard, online discard
-	 * is supported and the free blocks will be trimmed online.
-	 */
-	if (!test_opt(sb, DISCARD))
-		EXT4_MB_GRP_CLEAR_TRIMMED(db);
-
 	if (!db->bb_free_root.rb_node) {
 		/* No more items in the per group rb tree
 		 * balance refcounts from ext4_mb_free_metadata()
@@ -6587,8 +6578,7 @@  static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
 					 " group:%u block:%d count:%lu failed"
 					 " with %d", block_group, bit, count,
 					 err);
-		} else
-			EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
+		}
 
 		ext4_lock_group(sb, block_group);
 		mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
@@ -6598,6 +6588,14 @@  static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
 	ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
 	ext4_free_group_clusters_set(sb, gdp, ret);
 	ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
+	/*
+	 * Clear the trimmed flag for the group so that the next
+	 * ext4_trim_fs can trim it.
+	 * If the volume is mounted with -o discard, online discard
+	 * is supported and the free blocks will be trimmed online.
+	 */
+	if (!test_opt(sb, DISCARD))
+		gdp->bg_flags &= cpu_to_le16(~EXT4_BG_TRIMMED);
 	ext4_group_desc_csum_set(sb, block_group, gdp);
 	ext4_unlock_group(sb, block_group);
 
@@ -6995,10 +6993,19 @@  ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
 		   ext4_grpblk_t minblocks, bool set_trimmed)
 {
 	struct ext4_buddy e4b;
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	struct ext4_group_desc *gdp;
+	struct buffer_head *gd_bh;
 	int ret;
 
 	trace_ext4_trim_all_free(sb, group, start, max);
 
+	gdp = ext4_get_group_desc(sb, group, &gd_bh);
+	if (!gdp) {
+		ret = -EIO;
+		return ret;
+	}
+
 	ret = ext4_mb_load_buddy(sb, group, &e4b);
 	if (ret) {
 		ext4_warning(sb, "Error %d loading buddy information for %u",
@@ -7008,11 +7015,10 @@  ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
 
 	ext4_lock_group(sb, group);
 
-	if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) ||
+	if (!(es->s_flags & cpu_to_le32(EXT2_FLAGS_TRACK_TRIM) &&
+	      gdp->bg_flags & cpu_to_le16(EXT4_BG_TRIMMED)) ||
 	    minblocks < EXT4_SB(sb)->s_last_trim_minblks) {
 		ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks);
-		if (ret >= 0 && set_trimmed)
-			EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
 	} else {
 		ret = 0;
 	}
@@ -7020,6 +7026,35 @@  ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
 	ext4_unlock_group(sb, group);
 	ext4_mb_unload_buddy(&e4b);
 
+	if (ret > 0 && set_trimmed &&
+	    es->s_flags & cpu_to_le32(EXT2_FLAGS_TRACK_TRIM)) {
+		int err;
+		handle_t *handle;
+
+		handle = ext4_journal_start_sb(sb, EXT4_HT_FS_TRIM, 1);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out_return;
+		}
+		err = ext4_journal_get_write_access(handle, sb, gd_bh,
+						    EXT4_JTR_NONE);
+		if (err) {
+			ret = err;
+			goto out_journal;
+		}
+		ext4_lock_group(sb, group);
+		gdp->bg_flags |= cpu_to_le16(EXT4_BG_TRIMMED);
+		ext4_group_desc_csum_set(sb, group, gdp);
+		ext4_unlock_group(sb, group);
+		err = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
+		if (err)
+			ret = err;
+out_journal:
+		err = ext4_journal_stop(handle);
+		if (err)
+			ret = err;
+	}
+out_return:
 	ext4_debug("trimmed %d blocks in the group %d\n",
 		ret, group);