diff mbox series

ext4: introduce EXT4_BG_WAS_TRIMMED to optimize trim

Message ID 1590565130-23773-1-git-send-email-wangshilong1991@gmail.com
State New
Headers show
Series ext4: introduce EXT4_BG_WAS_TRIMMED to optimize trim | expand

Commit Message

Wang Shilong May 27, 2020, 7:38 a.m. UTC
From: Wang Shilong <wshilong@ddn.com>

Currently WAS_TRIMMED flag is not persistent, whenever filesystem was
remounted, fstrim need walk all block groups again, the problem with
this is FSTRIM could be slow on very large LUN SSD based filesystem.

To avoid this kind of problem, we introduce a block group flag
EXT4_BG_WAS_TRIMMED, the side effect of this is we need introduce
extra one block group dirty write after trimming block group.

And When clearing TRIMMED flag, block group will be journalled
anyway, so it won't introduce any overhead.

Cc: Shuichi Ihara <sihara@ddn.com>
Cc: Andreas Dilger <adilger@dilger.ca>
Cc: Wang Shilong <wangshilong1991@gmail.com>
Signed-off-by: Wang Shilong <wshilong@ddn.com>
---
 fs/ext4/ext4.h      | 18 +++++++--------
 fs/ext4/ext4_jbd2.h |  3 ++-
 fs/ext4/mballoc.c   | 54 ++++++++++++++++++++++++++++++++++-----------
 3 files changed, 52 insertions(+), 23 deletions(-)

Comments

Lukas Czerner May 27, 2020, 9:19 a.m. UTC | #1
On Wed, May 27, 2020 at 04:38:50PM +0900, Wang Shilong wrote:
> From: Wang Shilong <wshilong@ddn.com>
> 
> Currently WAS_TRIMMED flag is not persistent, whenever filesystem was
> remounted, fstrim need walk all block groups again, the problem with
> this is FSTRIM could be slow on very large LUN SSD based filesystem.
> 
> To avoid this kind of problem, we introduce a block group flag
> EXT4_BG_WAS_TRIMMED, the side effect of this is we need introduce
> extra one block group dirty write after trimming block group.

Hi

that's fair enough, however once you make this persistent we also need
to have a way to clear the flag, or at least bypass it. Storage can be
changed and if it does we might want to re-run the fstrim.

We also need to set this flag in mke2fs and e2fsck if appropriate.

more below...

> 
> And When clearing TRIMMED flag, block group will be journalled
> anyway, so it won't introduce any overhead.
> 
> Cc: Shuichi Ihara <sihara@ddn.com>
> Cc: Andreas Dilger <adilger@dilger.ca>
> Cc: Wang Shilong <wangshilong1991@gmail.com>
> Signed-off-by: Wang Shilong <wshilong@ddn.com>
> ---
>  fs/ext4/ext4.h      | 18 +++++++--------
>  fs/ext4/ext4_jbd2.h |  3 ++-
>  fs/ext4/mballoc.c   | 54 ++++++++++++++++++++++++++++++++++-----------
>  3 files changed, 52 insertions(+), 23 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index ad2dbf6e4924..23c2dc529a28 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -357,6 +357,7 @@ struct flex_groups {
>  #define EXT4_BG_INODE_UNINIT	0x0001 /* Inode table/bitmap not in use */
>  #define EXT4_BG_BLOCK_UNINIT	0x0002 /* Block bitmap not in use */
>  #define EXT4_BG_INODE_ZEROED	0x0004 /* On-disk itable initialized to zero */
> +#define EXT4_BG_WAS_TRIMMED	0x0008 /* block group was trimmed */
>  
>  /*
>   * Macro-instructions used to manage group descriptors
> @@ -3112,9 +3113,8 @@ struct ext4_group_info {
>  };
>  
>  #define EXT4_GROUP_INFO_NEED_INIT_BIT		0
> -#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT		1
> -#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT	2
> -#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT	3
> +#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT	1
> +#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT	2
>  #define EXT4_GROUP_INFO_BBITMAP_CORRUPT		\
>  	(1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT)
>  #define EXT4_GROUP_INFO_IBITMAP_CORRUPT		\
> @@ -3127,12 +3127,12 @@ struct ext4_group_info {
>  #define EXT4_MB_GRP_IBITMAP_CORRUPT(grp)	\
>  	(test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
>  
> -#define EXT4_MB_GRP_WAS_TRIMMED(grp)	\
> -	(test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
> -#define EXT4_MB_GRP_SET_TRIMMED(grp)	\
> -	(set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
> -#define EXT4_MB_GRP_CLEAR_TRIMMED(grp)	\
> -	(clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
> +#define EXT4_MB_GDP_WAS_TRIMMED(gdp)	\
> +	(gdp->bg_flags & cpu_to_le16(EXT4_BG_WAS_TRIMMED))
> +#define EXT4_MB_GDP_SET_TRIMMED(gdp)	\
> +	(gdp->bg_flags |= cpu_to_le16(EXT4_BG_WAS_TRIMMED))
> +#define EXT4_MB_GDP_CLEAR_TRIMMED(gdp)	\
> +	(gdp->bg_flags &= ~cpu_to_le16(EXT4_BG_WAS_TRIMMED))
>  
>  #define EXT4_MAX_CONTENTION		8
>  #define EXT4_CONTENTION_THRESHOLD	2
> diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
> index 4b9002f0e84c..4094a5b247f7 100644
> --- a/fs/ext4/ext4_jbd2.h
> +++ b/fs/ext4/ext4_jbd2.h
> @@ -123,7 +123,8 @@
>  #define EXT4_HT_MOVE_EXTENTS     9
>  #define EXT4_HT_XATTR           10
>  #define EXT4_HT_EXT_CONVERT     11
> -#define EXT4_HT_MAX             12
> +#define EXT4_HT_FS_TRIM		12
> +#define EXT4_HT_MAX             13
>  
>  /**
>   *   struct ext4_journal_cb_entry - Base structure for callback information.
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 30d5d97548c4..d25377948994 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -2829,15 +2829,6 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
>  	rb_erase(&entry->efd_node, &(db->bb_free_root));
>  	mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
>  
> -	/*
> -	 * Clear the trimmed flag for the group so that the next
> -	 * ext4_trim_fs can trim it.
> -	 * If the volume is mounted with -o discard, online discard
> -	 * is supported and the free blocks will be trimmed online.
> -	 */
> -	if (!test_opt(sb, DISCARD))
> -		EXT4_MB_GRP_CLEAR_TRIMMED(db);
> -
>  	if (!db->bb_free_root.rb_node) {
>  		/* No more items in the per group rb tree
>  		 * balance refcounts from ext4_mb_free_metadata()
> @@ -4928,8 +4919,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
>  					 " group:%d block:%d count:%lu failed"
>  					 " with %d", block_group, bit, count,
>  					 err);
> -		} else
> -			EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
> +		}
>  
>  		ext4_lock_group(sb, block_group);
>  		mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
> @@ -4939,6 +4929,14 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
>  	ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
>  	ext4_free_group_clusters_set(sb, gdp, ret);
>  	ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
> +	/*
> +	 * Clear the trimmed flag for the group so that the next
> +	 * ext4_trim_fs can trim it.
> +	 * If the volume is mounted with -o discard, online discard
> +	 * is supported and the free blocks will be trimmed online.
> +	 */
> +	if (!test_opt(sb, DISCARD))
> +		EXT4_MB_GDP_CLEAR_TRIMMED(gdp);
>  	ext4_group_desc_csum_set(sb, block_group, gdp);
>  	ext4_unlock_group(sb, block_group);
>  
> @@ -5192,8 +5190,15 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
>  	ext4_grpblk_t next, count = 0, free_count = 0;
>  	struct ext4_buddy e4b;
>  	int ret = 0;
> +	struct ext4_group_desc *gdp;
> +	struct buffer_head *gdp_bh;
>  
>  	trace_ext4_trim_all_free(sb, group, start, max);
> +	gdp = ext4_get_group_desc(sb, group, &gdp_bh);
> +	if (!gdp) {
> +		ret = -EIO;
> +		return ret;
> +	}
>  
>  	ret = ext4_mb_load_buddy(sb, group, &e4b);
>  	if (ret) {
> @@ -5204,7 +5209,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
>  	bitmap = e4b.bd_bitmap;
>  
>  	ext4_lock_group(sb, group);
> -	if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
> +	if (EXT4_MB_GDP_WAS_TRIMMED(gdp) &&
>  	    minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
>  		goto out;
>  
> @@ -5245,12 +5250,35 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
>  
>  	if (!ret) {
>  		ret = count;
> -		EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
> +		EXT4_MB_GDP_SET_TRIMMED(gdp);
> +		ext4_group_desc_csum_set(sb, group, gdp);
>  	}
>  out:
>  	ext4_unlock_group(sb, group);
>  	ext4_mb_unload_buddy(&e4b);
> +	if (ret > 0) {
> +		int err;
> +		handle_t *handle;
>  
> +		handle = ext4_journal_start_sb(sb, EXT4_HT_FS_TRIM, 1);
> +		if (IS_ERR(handle)) {
> +			ret = PTR_ERR(handle);
> +			goto out_return;
> +		}
> +		err = ext4_journal_get_write_access(handle, gdp_bh);
> +		if (err) {
> +			ret = err;
> +			goto out_journal;
> +		}

Don't we need to do this before we set the flag in gdp?

-Lukas

> +		err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
> +		if (err)
> +			ret = err;
> +out_journal:
> +		err = ext4_journal_stop(handle);
> +		if (err)
> +			ret = err;
> +	}
> +out_return:
>  	ext4_debug("trimmed %d blocks in the group %d\n",
>  		count, group);
>  
> -- 
> 2.25.4
>
Reindl Harald May 27, 2020, 9:32 a.m. UTC | #2
Am 27.05.20 um 11:19 schrieb Lukas Czerner:
> On Wed, May 27, 2020 at 04:38:50PM +0900, Wang Shilong wrote:
>> From: Wang Shilong <wshilong@ddn.com>
>>
>> Currently WAS_TRIMMED flag is not persistent, whenever filesystem was
>> remounted, fstrim need walk all block groups again, the problem with
>> this is FSTRIM could be slow on very large LUN SSD based filesystem.
>>
>> To avoid this kind of problem, we introduce a block group flag
>> EXT4_BG_WAS_TRIMMED, the side effect of this is we need introduce
>> extra one block group dirty write after trimming block group.

would that also fix the issue that *way too much* is trimmed all the
time, no matter if it's a thin provisioned vmware disk or a phyiscal
RAID10 with SSD

no way of 315 MB deletes within 2 hours or so on a system with just 485M
used

[root@firewall:~]$  fstrim -av
/boot: 0 B (0 bytes) trimmed on /dev/sda1
/: 315.2 MiB (330522624 bytes) trimmed on /dev/sdb1

[root@firewall:~]$  df
Filesystem     Type  Size  Used Avail Use% Mounted on
/dev/sdb1      ext4  5.8G  463M  5.4G   8% /
/dev/sda1      ext4  485M   42M  440M   9% /boot
Lukas Czerner May 27, 2020, 9:57 a.m. UTC | #3
On Wed, May 27, 2020 at 11:32:02AM +0200, Reindl Harald wrote:
> 
> 
> Am 27.05.20 um 11:19 schrieb Lukas Czerner:
> > On Wed, May 27, 2020 at 04:38:50PM +0900, Wang Shilong wrote:
> >> From: Wang Shilong <wshilong@ddn.com>
> >>
> >> Currently WAS_TRIMMED flag is not persistent, whenever filesystem was
> >> remounted, fstrim need walk all block groups again, the problem with
> >> this is FSTRIM could be slow on very large LUN SSD based filesystem.
> >>
> >> To avoid this kind of problem, we introduce a block group flag
> >> EXT4_BG_WAS_TRIMMED, the side effect of this is we need introduce
> >> extra one block group dirty write after trimming block group.
> 
> would that also fix the issue that *way too much* is trimmed all the
> time, no matter if it's a thin provisioned vmware disk or a phyiscal
> RAID10 with SSD

no, the mechanism remains the same, but the proposal is to make it
pesisten across re-mounts.

> 
> no way of 315 MB deletes within 2 hours or so on a system with just 485M
> used

The reason is that we're working on block group granularity. So if you
have almost free block group, and you free some blocks from it, the flag
gets freed and next time you run fstrim it'll trim all the free space in
the group. Then again if you free some blocks from the group, the flags
gets cleared again ...

But I don't think this is a problem at all. Certainly not worth tracking
free/trimmed extents to solve it.

> 
> [root@firewall:~]$  fstrim -av
> /boot: 0 B (0 bytes) trimmed on /dev/sda1
> /: 315.2 MiB (330522624 bytes) trimmed on /dev/sdb1

The solution is to run fstrim less often, that's the whole point of the
fstrim. If you need to run it more often, then you probably want to use
-o discard mount option.

-Lukas

> 
> [root@firewall:~]$  df
> Filesystem     Type  Size  Used Avail Use% Mounted on
> /dev/sdb1      ext4  5.8G  463M  5.4G   8% /
> /dev/sda1      ext4  485M   42M  440M   9% /boot
>
Wang Shilong May 27, 2020, 10:06 a.m. UTC | #4
On Wed, May 27, 2020 at 5:19 PM Lukas Czerner <lczerner@redhat.com> wrote:
>
> On Wed, May 27, 2020 at 04:38:50PM +0900, Wang Shilong wrote:
> > From: Wang Shilong <wshilong@ddn.com>
> >
> > Currently WAS_TRIMMED flag is not persistent, whenever filesystem was
> > remounted, fstrim need walk all block groups again, the problem with
> > this is FSTRIM could be slow on very large LUN SSD based filesystem.
> >
> > To avoid this kind of problem, we introduce a block group flag
> > EXT4_BG_WAS_TRIMMED, the side effect of this is we need introduce
> > extra one block group dirty write after trimming block group.
>
> Hi
>
> that's fair enough, however once you make this persistent we also need
> to have a way to clear the flag, or at least bypass it. Storage can be
> changed and if it does we might want to re-run the fstrim.

Yup, i thought about that.

1) we might add an mount option or sys interface, something force_fstrim
2) Add an option to e2fsck to force clear this block group flag.

>
> We also need to set this flag in mke2fs and e2fsck if appropriate.
>

Good point.

> more below...
>
> >
> > And When clearing TRIMMED flag, block group will be journalled
> > anyway, so it won't introduce any overhead.
> >
> > Cc: Shuichi Ihara <sihara@ddn.com>
> > Cc: Andreas Dilger <adilger@dilger.ca>
> > Cc: Wang Shilong <wangshilong1991@gmail.com>
> > Signed-off-by: Wang Shilong <wshilong@ddn.com>
> > ---
> >  fs/ext4/ext4.h      | 18 +++++++--------
> >  fs/ext4/ext4_jbd2.h |  3 ++-
> >  fs/ext4/mballoc.c   | 54 ++++++++++++++++++++++++++++++++++-----------
> >  3 files changed, 52 insertions(+), 23 deletions(-)
> >
> > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> > index ad2dbf6e4924..23c2dc529a28 100644
> > --- a/fs/ext4/ext4.h
> > +++ b/fs/ext4/ext4.h
> > @@ -357,6 +357,7 @@ struct flex_groups {
> >  #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */
> >  #define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */
> >  #define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */
> > +#define EXT4_BG_WAS_TRIMMED  0x0008 /* block group was trimmed */
> >
> >  /*
> >   * Macro-instructions used to manage group descriptors
> > @@ -3112,9 +3113,8 @@ struct ext4_group_info {
> >  };
> >
> >  #define EXT4_GROUP_INFO_NEED_INIT_BIT                0
> > -#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT              1
> > -#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT  2
> > -#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT  3
> > +#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT  1
> > +#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT  2
> >  #define EXT4_GROUP_INFO_BBITMAP_CORRUPT              \
> >       (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT)
> >  #define EXT4_GROUP_INFO_IBITMAP_CORRUPT              \
> > @@ -3127,12 +3127,12 @@ struct ext4_group_info {
> >  #define EXT4_MB_GRP_IBITMAP_CORRUPT(grp)     \
> >       (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
> >
> > -#define EXT4_MB_GRP_WAS_TRIMMED(grp) \
> > -     (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
> > -#define EXT4_MB_GRP_SET_TRIMMED(grp) \
> > -     (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
> > -#define EXT4_MB_GRP_CLEAR_TRIMMED(grp)       \
> > -     (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
> > +#define EXT4_MB_GDP_WAS_TRIMMED(gdp) \
> > +     (gdp->bg_flags & cpu_to_le16(EXT4_BG_WAS_TRIMMED))
> > +#define EXT4_MB_GDP_SET_TRIMMED(gdp) \
> > +     (gdp->bg_flags |= cpu_to_le16(EXT4_BG_WAS_TRIMMED))
> > +#define EXT4_MB_GDP_CLEAR_TRIMMED(gdp)       \
> > +     (gdp->bg_flags &= ~cpu_to_le16(EXT4_BG_WAS_TRIMMED))
> >
> >  #define EXT4_MAX_CONTENTION          8
> >  #define EXT4_CONTENTION_THRESHOLD    2
> > diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
> > index 4b9002f0e84c..4094a5b247f7 100644
> > --- a/fs/ext4/ext4_jbd2.h
> > +++ b/fs/ext4/ext4_jbd2.h
> > @@ -123,7 +123,8 @@
> >  #define EXT4_HT_MOVE_EXTENTS     9
> >  #define EXT4_HT_XATTR           10
> >  #define EXT4_HT_EXT_CONVERT     11
> > -#define EXT4_HT_MAX             12
> > +#define EXT4_HT_FS_TRIM              12
> > +#define EXT4_HT_MAX             13
> >
> >  /**
> >   *   struct ext4_journal_cb_entry - Base structure for callback information.
> > diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> > index 30d5d97548c4..d25377948994 100644
> > --- a/fs/ext4/mballoc.c
> > +++ b/fs/ext4/mballoc.c
> > @@ -2829,15 +2829,6 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
> >       rb_erase(&entry->efd_node, &(db->bb_free_root));
> >       mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
> >
> > -     /*
> > -      * Clear the trimmed flag for the group so that the next
> > -      * ext4_trim_fs can trim it.
> > -      * If the volume is mounted with -o discard, online discard
> > -      * is supported and the free blocks will be trimmed online.
> > -      */
> > -     if (!test_opt(sb, DISCARD))
> > -             EXT4_MB_GRP_CLEAR_TRIMMED(db);
> > -
> >       if (!db->bb_free_root.rb_node) {
> >               /* No more items in the per group rb tree
> >                * balance refcounts from ext4_mb_free_metadata()
> > @@ -4928,8 +4919,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
> >                                        " group:%d block:%d count:%lu failed"
> >                                        " with %d", block_group, bit, count,
> >                                        err);
> > -             } else
> > -                     EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
> > +             }
> >
> >               ext4_lock_group(sb, block_group);
> >               mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
> > @@ -4939,6 +4929,14 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
> >       ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
> >       ext4_free_group_clusters_set(sb, gdp, ret);
> >       ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
> > +     /*
> > +      * Clear the trimmed flag for the group so that the next
> > +      * ext4_trim_fs can trim it.
> > +      * If the volume is mounted with -o discard, online discard
> > +      * is supported and the free blocks will be trimmed online.
> > +      */
> > +     if (!test_opt(sb, DISCARD))
> > +             EXT4_MB_GDP_CLEAR_TRIMMED(gdp);
> >       ext4_group_desc_csum_set(sb, block_group, gdp);
> >       ext4_unlock_group(sb, block_group);
> >
> > @@ -5192,8 +5190,15 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
> >       ext4_grpblk_t next, count = 0, free_count = 0;
> >       struct ext4_buddy e4b;
> >       int ret = 0;
> > +     struct ext4_group_desc *gdp;
> > +     struct buffer_head *gdp_bh;
> >
> >       trace_ext4_trim_all_free(sb, group, start, max);
> > +     gdp = ext4_get_group_desc(sb, group, &gdp_bh);
> > +     if (!gdp) {
> > +             ret = -EIO;
> > +             return ret;
> > +     }
> >
> >       ret = ext4_mb_load_buddy(sb, group, &e4b);
> >       if (ret) {
> > @@ -5204,7 +5209,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
> >       bitmap = e4b.bd_bitmap;
> >
> >       ext4_lock_group(sb, group);
> > -     if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
> > +     if (EXT4_MB_GDP_WAS_TRIMMED(gdp) &&
> >           minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
> >               goto out;
> >
> > @@ -5245,12 +5250,35 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
> >
> >       if (!ret) {
> >               ret = count;
> > -             EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
> > +             EXT4_MB_GDP_SET_TRIMMED(gdp);
> > +             ext4_group_desc_csum_set(sb, group, gdp);
> >       }
> >  out:
> >       ext4_unlock_group(sb, group);
> >       ext4_mb_unload_buddy(&e4b);
> > +     if (ret > 0) {
> > +             int err;
> > +             handle_t *handle;
> >
> > +             handle = ext4_journal_start_sb(sb, EXT4_HT_FS_TRIM, 1);
> > +             if (IS_ERR(handle)) {
> > +                     ret = PTR_ERR(handle);
> > +                     goto out_return;
> > +             }
> > +             err = ext4_journal_get_write_access(handle, gdp_bh);
> > +             if (err) {
> > +                     ret = err;
> > +                     goto out_journal;
> > +             }
>
> Don't we need to do this before we set the flag in gdp?
>

Sorry about this, you are right.

> -Lukas
>
> > +             err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
> > +             if (err)
> > +                     ret = err;
> > +out_journal:
> > +             err = ext4_journal_stop(handle);
> > +             if (err)
> > +                     ret = err;
> > +     }
> > +out_return:
> >       ext4_debug("trimmed %d blocks in the group %d\n",
> >               count, group);
> >
> > --
> > 2.25.4
> >
>
Reindl Harald May 27, 2020, 10:11 a.m. UTC | #5
Am 27.05.20 um 11:57 schrieb Lukas Czerner:
> On Wed, May 27, 2020 at 11:32:02AM +0200, Reindl Harald wrote:
>>
>>
>> Am 27.05.20 um 11:19 schrieb Lukas Czerner:
>>> On Wed, May 27, 2020 at 04:38:50PM +0900, Wang Shilong wrote:
>>>> From: Wang Shilong <wshilong@ddn.com>
>>>>
>>>> Currently WAS_TRIMMED flag is not persistent, whenever filesystem was
>>>> remounted, fstrim need walk all block groups again, the problem with
>>>> this is FSTRIM could be slow on very large LUN SSD based filesystem.
>>>>
>>>> To avoid this kind of problem, we introduce a block group flag
>>>> EXT4_BG_WAS_TRIMMED, the side effect of this is we need introduce
>>>> extra one block group dirty write after trimming block group.
>>
>> would that also fix the issue that *way too much* is trimmed all the
>> time, no matter if it's a thin provisioned vmware disk or a phyiscal
>> RAID10 with SSD
> 
> no, the mechanism remains the same, but the proposal is to make it
> pesisten across re-mounts.
> 
>>
>> no way of 315 MB deletes within 2 hours or so on a system with just 485M
>> used
> 
> The reason is that we're working on block group granularity. So if you
> have almost free block group, and you free some blocks from it, the flag
> gets freed and next time you run fstrim it'll trim all the free space in
> the group. Then again if you free some blocks from the group, the flags
> gets cleared again ...
> 
> But I don't think this is a problem at all. Certainly not worth tracking
> free/trimmed extents to solve it.

it is a problem

on a daily "fstrim -av" you trim gigabytes of alredy trimmed blocks
which for example on a vmware thin provisioned vdisk makes it down to
CBT (changed-block-tracking)

so instead completly ignore that untouched space thanks to CBT it's
considered as changed and verified in the follow up backup run which
takes magnitutdes longer than needed

without that behavior our daily backups would take 3 minutes instead 1
hour but without fstrim the backup grows with useless temp data over time
Lukas Czerner May 27, 2020, 10:21 a.m. UTC | #6
On Wed, May 27, 2020 at 06:06:09PM +0800, Wang Shilong wrote:
> On Wed, May 27, 2020 at 5:19 PM Lukas Czerner <lczerner@redhat.com> wrote:
> >
> > On Wed, May 27, 2020 at 04:38:50PM +0900, Wang Shilong wrote:
> > > From: Wang Shilong <wshilong@ddn.com>
> > >
> > > Currently WAS_TRIMMED flag is not persistent, whenever filesystem was
> > > remounted, fstrim need walk all block groups again, the problem with
> > > this is FSTRIM could be slow on very large LUN SSD based filesystem.
> > >
> > > To avoid this kind of problem, we introduce a block group flag
> > > EXT4_BG_WAS_TRIMMED, the side effect of this is we need introduce
> > > extra one block group dirty write after trimming block group.
> >
> > Hi
> >
> > that's fair enough, however once you make this persistent we also need
> > to have a way to clear the flag, or at least bypass it. Storage can be
> > changed and if it does we might want to re-run the fstrim.
> 
> Yup, i thought about that.
> 
> 1) we might add an mount option or sys interface, something force_fstrim
> 2) Add an option to e2fsck to force clear this block group flag.

Option for tune2fs to clear the flags sounds way better than yet another
mount option.

Thanks!
-Lukas

> 
> >
> > We also need to set this flag in mke2fs and e2fsck if appropriate.
> >
> 
> Good point.
> 
> > more below...
> >
> > >
> > > And When clearing TRIMMED flag, block group will be journalled
> > > anyway, so it won't introduce any overhead.
> > >
> > > Cc: Shuichi Ihara <sihara@ddn.com>
> > > Cc: Andreas Dilger <adilger@dilger.ca>
> > > Cc: Wang Shilong <wangshilong1991@gmail.com>
> > > Signed-off-by: Wang Shilong <wshilong@ddn.com>
> > > ---
> > >  fs/ext4/ext4.h      | 18 +++++++--------
> > >  fs/ext4/ext4_jbd2.h |  3 ++-
> > >  fs/ext4/mballoc.c   | 54 ++++++++++++++++++++++++++++++++++-----------
> > >  3 files changed, 52 insertions(+), 23 deletions(-)
> > >
> > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> > > index ad2dbf6e4924..23c2dc529a28 100644
> > > --- a/fs/ext4/ext4.h
> > > +++ b/fs/ext4/ext4.h
> > > @@ -357,6 +357,7 @@ struct flex_groups {
> > >  #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */
> > >  #define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */
> > >  #define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */
> > > +#define EXT4_BG_WAS_TRIMMED  0x0008 /* block group was trimmed */
> > >
> > >  /*
> > >   * Macro-instructions used to manage group descriptors
> > > @@ -3112,9 +3113,8 @@ struct ext4_group_info {
> > >  };
> > >
> > >  #define EXT4_GROUP_INFO_NEED_INIT_BIT                0
> > > -#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT              1
> > > -#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT  2
> > > -#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT  3
> > > +#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT  1
> > > +#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT  2
> > >  #define EXT4_GROUP_INFO_BBITMAP_CORRUPT              \
> > >       (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT)
> > >  #define EXT4_GROUP_INFO_IBITMAP_CORRUPT              \
> > > @@ -3127,12 +3127,12 @@ struct ext4_group_info {
> > >  #define EXT4_MB_GRP_IBITMAP_CORRUPT(grp)     \
> > >       (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
> > >
> > > -#define EXT4_MB_GRP_WAS_TRIMMED(grp) \
> > > -     (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
> > > -#define EXT4_MB_GRP_SET_TRIMMED(grp) \
> > > -     (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
> > > -#define EXT4_MB_GRP_CLEAR_TRIMMED(grp)       \
> > > -     (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
> > > +#define EXT4_MB_GDP_WAS_TRIMMED(gdp) \
> > > +     (gdp->bg_flags & cpu_to_le16(EXT4_BG_WAS_TRIMMED))
> > > +#define EXT4_MB_GDP_SET_TRIMMED(gdp) \
> > > +     (gdp->bg_flags |= cpu_to_le16(EXT4_BG_WAS_TRIMMED))
> > > +#define EXT4_MB_GDP_CLEAR_TRIMMED(gdp)       \
> > > +     (gdp->bg_flags &= ~cpu_to_le16(EXT4_BG_WAS_TRIMMED))
> > >
> > >  #define EXT4_MAX_CONTENTION          8
> > >  #define EXT4_CONTENTION_THRESHOLD    2
> > > diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
> > > index 4b9002f0e84c..4094a5b247f7 100644
> > > --- a/fs/ext4/ext4_jbd2.h
> > > +++ b/fs/ext4/ext4_jbd2.h
> > > @@ -123,7 +123,8 @@
> > >  #define EXT4_HT_MOVE_EXTENTS     9
> > >  #define EXT4_HT_XATTR           10
> > >  #define EXT4_HT_EXT_CONVERT     11
> > > -#define EXT4_HT_MAX             12
> > > +#define EXT4_HT_FS_TRIM              12
> > > +#define EXT4_HT_MAX             13
> > >
> > >  /**
> > >   *   struct ext4_journal_cb_entry - Base structure for callback information.
> > > diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> > > index 30d5d97548c4..d25377948994 100644
> > > --- a/fs/ext4/mballoc.c
> > > +++ b/fs/ext4/mballoc.c
> > > @@ -2829,15 +2829,6 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
> > >       rb_erase(&entry->efd_node, &(db->bb_free_root));
> > >       mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
> > >
> > > -     /*
> > > -      * Clear the trimmed flag for the group so that the next
> > > -      * ext4_trim_fs can trim it.
> > > -      * If the volume is mounted with -o discard, online discard
> > > -      * is supported and the free blocks will be trimmed online.
> > > -      */
> > > -     if (!test_opt(sb, DISCARD))
> > > -             EXT4_MB_GRP_CLEAR_TRIMMED(db);
> > > -
> > >       if (!db->bb_free_root.rb_node) {
> > >               /* No more items in the per group rb tree
> > >                * balance refcounts from ext4_mb_free_metadata()
> > > @@ -4928,8 +4919,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
> > >                                        " group:%d block:%d count:%lu failed"
> > >                                        " with %d", block_group, bit, count,
> > >                                        err);
> > > -             } else
> > > -                     EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
> > > +             }
> > >
> > >               ext4_lock_group(sb, block_group);
> > >               mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
> > > @@ -4939,6 +4929,14 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
> > >       ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
> > >       ext4_free_group_clusters_set(sb, gdp, ret);
> > >       ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
> > > +     /*
> > > +      * Clear the trimmed flag for the group so that the next
> > > +      * ext4_trim_fs can trim it.
> > > +      * If the volume is mounted with -o discard, online discard
> > > +      * is supported and the free blocks will be trimmed online.
> > > +      */
> > > +     if (!test_opt(sb, DISCARD))
> > > +             EXT4_MB_GDP_CLEAR_TRIMMED(gdp);
> > >       ext4_group_desc_csum_set(sb, block_group, gdp);
> > >       ext4_unlock_group(sb, block_group);
> > >
> > > @@ -5192,8 +5190,15 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
> > >       ext4_grpblk_t next, count = 0, free_count = 0;
> > >       struct ext4_buddy e4b;
> > >       int ret = 0;
> > > +     struct ext4_group_desc *gdp;
> > > +     struct buffer_head *gdp_bh;
> > >
> > >       trace_ext4_trim_all_free(sb, group, start, max);
> > > +     gdp = ext4_get_group_desc(sb, group, &gdp_bh);
> > > +     if (!gdp) {
> > > +             ret = -EIO;
> > > +             return ret;
> > > +     }
> > >
> > >       ret = ext4_mb_load_buddy(sb, group, &e4b);
> > >       if (ret) {
> > > @@ -5204,7 +5209,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
> > >       bitmap = e4b.bd_bitmap;
> > >
> > >       ext4_lock_group(sb, group);
> > > -     if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
> > > +     if (EXT4_MB_GDP_WAS_TRIMMED(gdp) &&
> > >           minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
> > >               goto out;
> > >
> > > @@ -5245,12 +5250,35 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
> > >
> > >       if (!ret) {
> > >               ret = count;
> > > -             EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
> > > +             EXT4_MB_GDP_SET_TRIMMED(gdp);
> > > +             ext4_group_desc_csum_set(sb, group, gdp);
> > >       }
> > >  out:
> > >       ext4_unlock_group(sb, group);
> > >       ext4_mb_unload_buddy(&e4b);
> > > +     if (ret > 0) {
> > > +             int err;
> > > +             handle_t *handle;
> > >
> > > +             handle = ext4_journal_start_sb(sb, EXT4_HT_FS_TRIM, 1);
> > > +             if (IS_ERR(handle)) {
> > > +                     ret = PTR_ERR(handle);
> > > +                     goto out_return;
> > > +             }
> > > +             err = ext4_journal_get_write_access(handle, gdp_bh);
> > > +             if (err) {
> > > +                     ret = err;
> > > +                     goto out_journal;
> > > +             }
> >
> > Don't we need to do this before we set the flag in gdp?
> >
> 
> Sorry about this, you are right.
> 
> > -Lukas
> >
> > > +             err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
> > > +             if (err)
> > > +                     ret = err;
> > > +out_journal:
> > > +             err = ext4_journal_stop(handle);
> > > +             if (err)
> > > +                     ret = err;
> > > +     }
> > > +out_return:
> > >       ext4_debug("trimmed %d blocks in the group %d\n",
> > >               count, group);
> > >
> > > --
> > > 2.25.4
> > >
> >
>
Lukas Czerner May 27, 2020, 10:32 a.m. UTC | #7
On Wed, May 27, 2020 at 12:11:52PM +0200, Reindl Harald wrote:
> 
> Am 27.05.20 um 11:57 schrieb Lukas Czerner:
> > On Wed, May 27, 2020 at 11:32:02AM +0200, Reindl Harald wrote:
> >>
> >>
> >> Am 27.05.20 um 11:19 schrieb Lukas Czerner:
> >>> On Wed, May 27, 2020 at 04:38:50PM +0900, Wang Shilong wrote:
> >>>> From: Wang Shilong <wshilong@ddn.com>
> >>>>
> >>>> Currently WAS_TRIMMED flag is not persistent, whenever filesystem was
> >>>> remounted, fstrim need walk all block groups again, the problem with
> >>>> this is FSTRIM could be slow on very large LUN SSD based filesystem.
> >>>>
> >>>> To avoid this kind of problem, we introduce a block group flag
> >>>> EXT4_BG_WAS_TRIMMED, the side effect of this is we need introduce
> >>>> extra one block group dirty write after trimming block group.
> >>
> >> would that also fix the issue that *way too much* is trimmed all the
> >> time, no matter if it's a thin provisioned vmware disk or a phyiscal
> >> RAID10 with SSD
> > 
> > no, the mechanism remains the same, but the proposal is to make it
> > pesisten across re-mounts.
> > 
> >>
> >> no way of 315 MB deletes within 2 hours or so on a system with just 485M
> >> used
> > 
> > The reason is that we're working on block group granularity. So if you
> > have almost free block group, and you free some blocks from it, the flag
> > gets freed and next time you run fstrim it'll trim all the free space in
> > the group. Then again if you free some blocks from the group, the flags
> > gets cleared again ...
> > 
> > But I don't think this is a problem at all. Certainly not worth tracking
> > free/trimmed extents to solve it.
> 
> it is a problem
> 
> on a daily "fstrim -av" you trim gigabytes of alredy trimmed blocks
> which for example on a vmware thin provisioned vdisk makes it down to
> CBT (changed-block-tracking)
> 
> so instead completly ignore that untouched space thanks to CBT it's
> considered as changed and verified in the follow up backup run which
> takes magnitutdes longer than needed

Looks like you identified the problem then ;)

But seriously, trim/discard was always considered advisory and the
storage is completely free to do whatever it wants to do with the
information. I might even be the case that the discard requests are
ignored and we might not even need optimization like this. But
regardless it does take time to go through the block gropus and as a
result this optimization is useful in the fs itself.

However it seems to me that the situation you're describing calls for
optimization on a storage side (TP vdisk in your case), not file system
side.

And again, for fine grained discard you can use -o discard.

-Lukas

> 
> without that behavior our daily backups would take 3 minutes instead 1
> hour but without fstrim the backup grows with useless temp data over time
>
Reindl Harald May 27, 2020, 10:56 a.m. UTC | #8
Am 27.05.20 um 12:32 schrieb Lukas Czerner:
> On Wed, May 27, 2020 at 12:11:52PM +0200, Reindl Harald wrote:
>>
>> Am 27.05.20 um 11:57 schrieb Lukas Czerner:
>>> On Wed, May 27, 2020 at 11:32:02AM +0200, Reindl Harald wrote:
>>>>
>>>>
>>>> Am 27.05.20 um 11:19 schrieb Lukas Czerner:
>>>>> On Wed, May 27, 2020 at 04:38:50PM +0900, Wang Shilong wrote:
>>>>>> From: Wang Shilong <wshilong@ddn.com>
>>>>>>
>>>>>> Currently WAS_TRIMMED flag is not persistent, whenever filesystem was
>>>>>> remounted, fstrim need walk all block groups again, the problem with
>>>>>> this is FSTRIM could be slow on very large LUN SSD based filesystem.
>>>>>>
>>>>>> To avoid this kind of problem, we introduce a block group flag
>>>>>> EXT4_BG_WAS_TRIMMED, the side effect of this is we need introduce
>>>>>> extra one block group dirty write after trimming block group.
>>>>
>>>> would that also fix the issue that *way too much* is trimmed all the
>>>> time, no matter if it's a thin provisioned vmware disk or a phyiscal
>>>> RAID10 with SSD
>>>
>>> no, the mechanism remains the same, but the proposal is to make it
>>> pesisten across re-mounts.
>>>
>>>>
>>>> no way of 315 MB deletes within 2 hours or so on a system with just 485M
>>>> used
>>>
>>> The reason is that we're working on block group granularity. So if you
>>> have almost free block group, and you free some blocks from it, the flag
>>> gets freed and next time you run fstrim it'll trim all the free space in
>>> the group. Then again if you free some blocks from the group, the flags
>>> gets cleared again ...
>>>
>>> But I don't think this is a problem at all. Certainly not worth tracking
>>> free/trimmed extents to solve it.
>>
>> it is a problem
>>
>> on a daily "fstrim -av" you trim gigabytes of alredy trimmed blocks
>> which for example on a vmware thin provisioned vdisk makes it down to
>> CBT (changed-block-tracking)
>>
>> so instead completly ignore that untouched space thanks to CBT it's
>> considered as changed and verified in the follow up backup run which
>> takes magnitutdes longer than needed
> 
> Looks like you identified the problem then ;)

well, in a perfect world.....

> But seriously, trim/discard was always considered advisory and the
> storage is completely free to do whatever it wants to do with the
> information. I might even be the case that the discard requests are
> ignored and we might not even need optimization like this. But
> regardless it does take time to go through the block gropus and as a
> result this optimization is useful in the fs itself.

luckily at least fstrim is non-blocking in a vmware environment, on my
physical box it takes ages

this machine *does nothing* than wait to be cloned, 235 MB pretended
deleted data within 50 minutes is absurd on a completly idle guest

so even when i am all in for optimizations thatÄs way over top

[root@master:~]$ fstrim -av
/boot: 0 B (0 bytes) trimmed on /dev/sda1
/: 235.8 MiB (247201792 bytes) trimmed on /dev/sdb1

[root@master:~]$ df
Filesystem     Type  Size  Used Avail Use% Mounted on
/dev/sdb1      ext4  5.8G  502M  5.3G   9% /
/dev/sda1      ext4  485M   39M  443M   9% /boot

> However it seems to me that the situation you're describing calls for
> optimization on a storage side (TP vdisk in your case), not file system
> side.
> 
> And again, for fine grained discard you can use -o discard

with a terrible performance impact at runtime
Andreas Dilger May 27, 2020, 7:11 p.m. UTC | #9
On May 27, 2020, at 4:56 AM, Reindl Harald <h.reindl@thelounge.net> wrote:
> Am 27.05.20 um 12:32 schrieb Lukas Czerner:
>> On Wed, May 27, 2020 at 12:11:52PM +0200, Reindl Harald wrote:
>>> 
>>> Am 27.05.20 um 11:57 schrieb Lukas Czerner:
>>>> On Wed, May 27, 2020 at 11:32:02AM +0200, Reindl Harald wrote:
>>>>> would that also fix the issue that *way too much* is trimmed all the
>>>>> time, no matter if it's a thin provisioned vmware disk or a phyiscal
>>>>> RAID10 with SSD
>>>> 
>>>> no, the mechanism remains the same, but the proposal is to make it
>>>> pesisten across re-mounts.
>>>> 
>>>>> 
>>>>> no way of 315 MB deletes within 2 hours or so on a system with just 485M
>>>>> used
>>>> 
>>>> The reason is that we're working on block group granularity. So if you
>>>> have almost free block group, and you free some blocks from it, the flag
>>>> gets freed and next time you run fstrim it'll trim all the free space in
>>>> the group. Then again if you free some blocks from the group, the flags
>>>> gets cleared again ...
>>>> 
>>>> But I don't think this is a problem at all. Certainly not worth tracking
>>>> free/trimmed extents to solve it.
>>> 
>>> it is a problem
>>> 
>>> on a daily "fstrim -av" you trim gigabytes of alredy trimmed blocks
>>> which for example on a vmware thin provisioned vdisk makes it down to
>>> CBT (changed-block-tracking)
>>> 
>>> so instead completly ignore that untouched space thanks to CBT it's
>>> considered as changed and verified in the follow up backup run which
>>> takes magnitutdes longer than needed
>> 
>> Looks like you identified the problem then ;)
> 
> well, in a perfect world.....
> 
>> But seriously, trim/discard was always considered advisory and the
>> storage is completely free to do whatever it wants to do with the
>> information. I might even be the case that the discard requests are
>> ignored and we might not even need optimization like this. But
>> regardless it does take time to go through the block gropus and as a
>> result this optimization is useful in the fs itself.
> 
> luckily at least fstrim is non-blocking in a vmware environment, on my
> physical box it takes ages
> 
> this machine *does nothing* than wait to be cloned, 235 MB pretended
> deleted data within 50 minutes is absurd on a completly idle guest
> 
> so even when i am all in for optimizations thatÄs way over top
> 
> [root@master:~]$ fstrim -av
> /boot: 0 B (0 bytes) trimmed on /dev/sda1
> /: 235.8 MiB (247201792 bytes) trimmed on /dev/sdb1
> 
> [root@master:~]$ df
> Filesystem     Type  Size  Used Avail Use% Mounted on
> /dev/sdb1      ext4  5.8G  502M  5.3G   9% /
> /dev/sda1      ext4  485M   39M  443M   9% /boot


I don't think that this patch will necessarily fix the problem you
are seeing, in the sense that WAS_TRIMMED was previously stored in
the group descriptor in memory, so repeated fstrim runs _shouldn't_
result in the group being trimmed unless it had some blocks freed.
If your image has even one block freed in any group, then fstrim will
result in all of the free blocks in that group being trimmed again.

That said, I think a follow-on optimization would be to limit *clearing*
the WAS_TRIMMED flag until at least some minimum number of blocks have
been freed (e.g. at least 1024 blocks freed, or the group is "empty", or
similar).  That would avoid excess TRIM calls down to the storage when
only a few blocks were freed that would be unlikely to actually result
in an erase blocks being freed.  This size could be dynamic based on
the minimum trim size passed by fstrim (e.g. min(1024 * minblocks, 16MB)).

That would also fit in nicely with changing "-o discard" over to using
per-block group tracking of the trim state, and use the same mechanism
as fstrim, rather than using the per-extent tracking that is used today
and causes (IMHO) too many small trims to the storage to be useful.
Not only do many small trim requests cause overhead during IO, but they
can also miss actual trims because the individual extent are smaller
than the discard size of the storage, and it doesn't combine the trim
range with adjacent free blocks.

Doing the block-group-based trim in the background, when a group has had
a bunch of blocks freed, and when the filesystem is idle (maybe a percpu
counter that is incremented whenever user read/write requests are done,
that can be checked for idleness when there are groups to be trimmed)
doesn't have to be perfect or totally crash proof, since it will always
have another chance to trim the group the next time some blocks are freed,
or if manual "fstrim" is called on the group with a smaller minlen.

Cheers, Andreas
diff mbox series

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index ad2dbf6e4924..23c2dc529a28 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -357,6 +357,7 @@  struct flex_groups {
 #define EXT4_BG_INODE_UNINIT	0x0001 /* Inode table/bitmap not in use */
 #define EXT4_BG_BLOCK_UNINIT	0x0002 /* Block bitmap not in use */
 #define EXT4_BG_INODE_ZEROED	0x0004 /* On-disk itable initialized to zero */
+#define EXT4_BG_WAS_TRIMMED	0x0008 /* block group was trimmed */
 
 /*
  * Macro-instructions used to manage group descriptors
@@ -3112,9 +3113,8 @@  struct ext4_group_info {
 };
 
 #define EXT4_GROUP_INFO_NEED_INIT_BIT		0
-#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT		1
-#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT	2
-#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT	3
+#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT	1
+#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT	2
 #define EXT4_GROUP_INFO_BBITMAP_CORRUPT		\
 	(1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT)
 #define EXT4_GROUP_INFO_IBITMAP_CORRUPT		\
@@ -3127,12 +3127,12 @@  struct ext4_group_info {
 #define EXT4_MB_GRP_IBITMAP_CORRUPT(grp)	\
 	(test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
 
-#define EXT4_MB_GRP_WAS_TRIMMED(grp)	\
-	(test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
-#define EXT4_MB_GRP_SET_TRIMMED(grp)	\
-	(set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
-#define EXT4_MB_GRP_CLEAR_TRIMMED(grp)	\
-	(clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GDP_WAS_TRIMMED(gdp)	\
+	(gdp->bg_flags & cpu_to_le16(EXT4_BG_WAS_TRIMMED))
+#define EXT4_MB_GDP_SET_TRIMMED(gdp)	\
+	(gdp->bg_flags |= cpu_to_le16(EXT4_BG_WAS_TRIMMED))
+#define EXT4_MB_GDP_CLEAR_TRIMMED(gdp)	\
+	(gdp->bg_flags &= ~cpu_to_le16(EXT4_BG_WAS_TRIMMED))
 
 #define EXT4_MAX_CONTENTION		8
 #define EXT4_CONTENTION_THRESHOLD	2
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 4b9002f0e84c..4094a5b247f7 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -123,7 +123,8 @@ 
 #define EXT4_HT_MOVE_EXTENTS     9
 #define EXT4_HT_XATTR           10
 #define EXT4_HT_EXT_CONVERT     11
-#define EXT4_HT_MAX             12
+#define EXT4_HT_FS_TRIM		12
+#define EXT4_HT_MAX             13
 
 /**
  *   struct ext4_journal_cb_entry - Base structure for callback information.
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 30d5d97548c4..d25377948994 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2829,15 +2829,6 @@  static void ext4_free_data_in_buddy(struct super_block *sb,
 	rb_erase(&entry->efd_node, &(db->bb_free_root));
 	mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
 
-	/*
-	 * Clear the trimmed flag for the group so that the next
-	 * ext4_trim_fs can trim it.
-	 * If the volume is mounted with -o discard, online discard
-	 * is supported and the free blocks will be trimmed online.
-	 */
-	if (!test_opt(sb, DISCARD))
-		EXT4_MB_GRP_CLEAR_TRIMMED(db);
-
 	if (!db->bb_free_root.rb_node) {
 		/* No more items in the per group rb tree
 		 * balance refcounts from ext4_mb_free_metadata()
@@ -4928,8 +4919,7 @@  void ext4_free_blocks(handle_t *handle, struct inode *inode,
 					 " group:%d block:%d count:%lu failed"
 					 " with %d", block_group, bit, count,
 					 err);
-		} else
-			EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
+		}
 
 		ext4_lock_group(sb, block_group);
 		mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
@@ -4939,6 +4929,14 @@  void ext4_free_blocks(handle_t *handle, struct inode *inode,
 	ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
 	ext4_free_group_clusters_set(sb, gdp, ret);
 	ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
+	/*
+	 * Clear the trimmed flag for the group so that the next
+	 * ext4_trim_fs can trim it.
+	 * If the volume is mounted with -o discard, online discard
+	 * is supported and the free blocks will be trimmed online.
+	 */
+	if (!test_opt(sb, DISCARD))
+		EXT4_MB_GDP_CLEAR_TRIMMED(gdp);
 	ext4_group_desc_csum_set(sb, block_group, gdp);
 	ext4_unlock_group(sb, block_group);
 
@@ -5192,8 +5190,15 @@  ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
 	ext4_grpblk_t next, count = 0, free_count = 0;
 	struct ext4_buddy e4b;
 	int ret = 0;
+	struct ext4_group_desc *gdp;
+	struct buffer_head *gdp_bh;
 
 	trace_ext4_trim_all_free(sb, group, start, max);
+	gdp = ext4_get_group_desc(sb, group, &gdp_bh);
+	if (!gdp) {
+		ret = -EIO;
+		return ret;
+	}
 
 	ret = ext4_mb_load_buddy(sb, group, &e4b);
 	if (ret) {
@@ -5204,7 +5209,7 @@  ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
 	bitmap = e4b.bd_bitmap;
 
 	ext4_lock_group(sb, group);
-	if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
+	if (EXT4_MB_GDP_WAS_TRIMMED(gdp) &&
 	    minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
 		goto out;
 
@@ -5245,12 +5250,35 @@  ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
 
 	if (!ret) {
 		ret = count;
-		EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
+		EXT4_MB_GDP_SET_TRIMMED(gdp);
+		ext4_group_desc_csum_set(sb, group, gdp);
 	}
 out:
 	ext4_unlock_group(sb, group);
 	ext4_mb_unload_buddy(&e4b);
+	if (ret > 0) {
+		int err;
+		handle_t *handle;
 
+		handle = ext4_journal_start_sb(sb, EXT4_HT_FS_TRIM, 1);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out_return;
+		}
+		err = ext4_journal_get_write_access(handle, gdp_bh);
+		if (err) {
+			ret = err;
+			goto out_journal;
+		}
+		err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
+		if (err)
+			ret = err;
+out_journal:
+		err = ext4_journal_stop(handle);
+		if (err)
+			ret = err;
+	}
+out_return:
 	ext4_debug("trimmed %d blocks in the group %d\n",
 		count, group);