diff mbox series

[01/11] ext4: add handling for extended mount options

Message ID 20190722040011.18892-1-harshadshirwadkar@gmail.com
State Superseded
Headers show
Series [01/11] ext4: add handling for extended mount options | expand

Commit Message

harshad shirwadkar July 22, 2019, 4 a.m. UTC
We are running out of mount option bits. This patch adds handling for
using s_mount_opt2 and also adds ability to turn on / off the fast
commit feature. In order to use fast commits, new version e2fsprogs
needs to set the fast feature commit flag. This also makes sure that
we have fast commit compatible e2fsprogs before starting to use the
feature. Mount flag "no_fastcommit", introuced in this patch, can be
passed to disable the feature at mount time.

Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
---
 fs/ext4/ext4.h       |  4 ++++
 fs/ext4/super.c      | 27 ++++++++++++++++++++++-----
 include/linux/jbd2.h |  5 ++++-
 3 files changed, 30 insertions(+), 6 deletions(-)

Comments

Andreas Dilger July 22, 2019, 6:15 p.m. UTC | #1
Unless I missed it, this patch series needs a 00/11 email that describes
*what* "fast commit" is, and why we want it.  This should include some
benchmark results, since (I'd assume) that the "fast" part of the feature
name implies a performance improvement?

Cheers, Andreas

> On Jul 21, 2019, at 10:00 PM, Harshad Shirwadkar <harshadshirwadkar@gmail.com> wrote:
> 
> We are running out of mount option bits. This patch adds handling for
> using s_mount_opt2 and also adds ability to turn on / off the fast
> commit feature. In order to use fast commits, new version e2fsprogs
> needs to set the fast feature commit flag. This also makes sure that
> we have fast commit compatible e2fsprogs before starting to use the
> feature. Mount flag "no_fastcommit", introuced in this patch, can be
> passed to disable the feature at mount time.
> 
> Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
> ---
> fs/ext4/ext4.h       |  4 ++++
> fs/ext4/super.c      | 27 ++++++++++++++++++++++-----
> include/linux/jbd2.h |  5 ++++-
> 3 files changed, 30 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index bf660aa7a9e0..becbda38b7db 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1146,6 +1146,8 @@ struct ext4_inode_info {
> #define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM	0x00000008 /* User explicitly
> 						specified journal checksum */
> 
> +#define EXT4_MOUNT2_JOURNAL_FAST_COMMIT	0x00000010 /* Journal fast commit */
> +
> #define clear_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt &= \
> 						~EXT4_MOUNT_##opt
> #define set_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt |= \
> @@ -1643,6 +1645,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
> #define EXT4_FEATURE_COMPAT_RESIZE_INODE	0x0010
> #define EXT4_FEATURE_COMPAT_DIR_INDEX		0x0020
> #define EXT4_FEATURE_COMPAT_SPARSE_SUPER2	0x0200
> +#define EXT4_FEATURE_COMPAT_FAST_COMMIT		0x0400
> 
> #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER	0x0001
> #define EXT4_FEATURE_RO_COMPAT_LARGE_FILE	0x0002
> @@ -1743,6 +1746,7 @@ EXT4_FEATURE_COMPAT_FUNCS(xattr,		EXT_ATTR)
> EXT4_FEATURE_COMPAT_FUNCS(resize_inode,		RESIZE_INODE)
> EXT4_FEATURE_COMPAT_FUNCS(dir_index,		DIR_INDEX)
> EXT4_FEATURE_COMPAT_FUNCS(sparse_super2,	SPARSE_SUPER2)
> +EXT4_FEATURE_COMPAT_FUNCS(fast_commit,		FAST_COMMIT)
> 
> EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super,	SPARSE_SUPER)
> EXT4_FEATURE_RO_COMPAT_FUNCS(large_file,	LARGE_FILE)
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 4079605d437a..e376ac040cce 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -1455,6 +1455,7 @@ enum {
> 	Opt_dioread_nolock, Opt_dioread_lock,
> 	Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
> 	Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
> +	Opt_no_fastcommit
> };
> 
> static const match_table_t tokens = {
> @@ -1537,6 +1538,7 @@ static const match_table_t tokens = {
> 	{Opt_init_itable, "init_itable=%u"},
> 	{Opt_init_itable, "init_itable"},
> 	{Opt_noinit_itable, "noinit_itable"},
> +	{Opt_no_fastcommit, "no_fastcommit"},
> 	{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
> 	{Opt_test_dummy_encryption, "test_dummy_encryption"},
> 	{Opt_nombcache, "nombcache"},
> @@ -1659,6 +1661,7 @@ static int clear_qf_name(struct super_block *sb, int qtype)
> #define MOPT_NO_EXT3	0x0200
> #define MOPT_EXT4_ONLY	(MOPT_NO_EXT2 | MOPT_NO_EXT3)
> #define MOPT_STRING	0x0400
> +#define MOPT_2		0x0800
> 
> static const struct mount_opts {
> 	int	token;
> @@ -1751,6 +1754,8 @@ static const struct mount_opts {
> 	{Opt_max_dir_size_kb, 0, MOPT_GTE0},
> 	{Opt_test_dummy_encryption, 0, MOPT_GTE0},
> 	{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
> +	{Opt_no_fastcommit, EXT4_MOUNT2_JOURNAL_FAST_COMMIT,
> +	 MOPT_CLEAR | MOPT_2 | MOPT_EXT4_ONLY},
> 	{Opt_err, 0, 0}
> };
> 
> @@ -1858,8 +1863,9 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
> 			set_opt2(sb, EXPLICIT_DELALLOC);
> 		} else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) {
> 			set_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM);
> -		} else
> +		} else if (m->mount_opt) {
> 			return -1;
> +		}
> 	}
> 	if (m->flags & MOPT_CLEAR_ERR)
> 		clear_opt(sb, ERRORS_MASK);
> @@ -2027,10 +2033,17 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
> 			WARN_ON(1);
> 			return -1;
> 		}
> -		if (arg != 0)
> -			sbi->s_mount_opt |= m->mount_opt;
> -		else
> -			sbi->s_mount_opt &= ~m->mount_opt;
> +		if (m->flags & MOPT_2) {
> +			if (arg != 0)
> +				sbi->s_mount_opt2 |= m->mount_opt;
> +			else
> +				sbi->s_mount_opt2 &= ~m->mount_opt;
> +		} else {
> +			if (arg != 0)
> +				sbi->s_mount_opt |= m->mount_opt;
> +			else
> +				sbi->s_mount_opt &= ~m->mount_opt;
> +		}
> 	}
> 	return 1;
> }
> @@ -3733,6 +3746,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> #ifdef CONFIG_EXT4_FS_POSIX_ACL
> 	set_opt(sb, POSIX_ACL);
> #endif
> +	if (ext4_has_feature_fast_commit(sb))
> +		set_opt2(sb, JOURNAL_FAST_COMMIT);
> +
> 	/* don't forget to enable journal_csum when metadata_csum is enabled. */
> 	if (ext4_has_metadata_csum(sb))
> 		set_opt(sb, JOURNAL_CHECKSUM);
> @@ -4334,6 +4350,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> 		sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
> 		clear_opt(sb, JOURNAL_CHECKSUM);
> 		clear_opt(sb, DATA_FLAGS);
> +		clear_opt2(sb, JOURNAL_FAST_COMMIT);
> 		sbi->s_journal = NULL;
> 		needs_recovery = 0;
> 		goto no_journal;
> diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
> index df03825ad1a1..b7eed49b8ecd 100644
> --- a/include/linux/jbd2.h
> +++ b/include/linux/jbd2.h
> @@ -288,6 +288,7 @@ typedef struct journal_superblock_s
> #define JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT	0x00000004
> #define JBD2_FEATURE_INCOMPAT_CSUM_V2		0x00000008
> #define JBD2_FEATURE_INCOMPAT_CSUM_V3		0x00000010
> +#define JBD2_FEATURE_INCOMPAT_FAST_COMMIT	0x00000020
> 
> /* See "journal feature predicate functions" below */
> 
> @@ -298,7 +299,8 @@ typedef struct journal_superblock_s
> 					JBD2_FEATURE_INCOMPAT_64BIT | \
> 					JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | \
> 					JBD2_FEATURE_INCOMPAT_CSUM_V2 | \
> -					JBD2_FEATURE_INCOMPAT_CSUM_V3)
> +					JBD2_FEATURE_INCOMPAT_CSUM_V3 | \
> +					JBD2_FEATURE_INCOMPAT_FAST_COMMIT)
> 
> #ifdef __KERNEL__
> 
> @@ -1235,6 +1237,7 @@ JBD2_FEATURE_INCOMPAT_FUNCS(64bit,		64BIT)
> JBD2_FEATURE_INCOMPAT_FUNCS(async_commit,	ASYNC_COMMIT)
> JBD2_FEATURE_INCOMPAT_FUNCS(csum2,		CSUM_V2)
> JBD2_FEATURE_INCOMPAT_FUNCS(csum3,		CSUM_V3)
> +JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit,	FAST_COMMIT)
> 
> /*
>  * Journal flag definitions
> --
> 2.22.0.657.g960e92d24f-goog
> 


Cheers, Andreas
Theodore Ts'o July 22, 2019, 9:02 p.m. UTC | #2
On Mon, Jul 22, 2019 at 12:15:11PM -0600, Andreas Dilger wrote:
> Unless I missed it, this patch series needs a 00/11 email that describes
> *what* "fast commit" is, and why we want it.  This should include some
> benchmark results, since (I'd assume) that the "fast" part of the feature
> name implies a performance improvement?

For background, it's a simplified version of the scheme proposed by
Park and Shin, in their paper, "iJournaling: Fine-Grained Journaling
for Improving the Latency of Fsync System Call"[1]

[1] https://www.usenix.org/conference/atc17/technical-sessions/presentation/park

I agree we should have a cover letter for this patch series.  Also, we
should add documentation to Documentation/filesystems/journaling.rst
about this feature; what it does, how it works, its basic on-disk
format changes, etc.

The fs/jbd2 layer isn't as well documented as the fs/ext4 code, and
bringing Documentation/filesystems/journaling.rst to the same level as
Documentation/filesystems/ext4/* isn't a fair/reasonable request.  On
the other hand, documenting what is being added by this patch series
is something that I think we should do.

   	     	    	     	    - Ted
harshad shirwadkar July 22, 2019, 9:36 p.m. UTC | #3
Thanks Andreas. Thanks Ted for sharing the original paper link. I'll
submit a patch 00/11 with proposed documentation and cover letter
describing the feature and results from benchmarks.


On Mon, Jul 22, 2019 at 2:02 PM Theodore Y. Ts'o <tytso@mit.edu> wrote:
>
> On Mon, Jul 22, 2019 at 12:15:11PM -0600, Andreas Dilger wrote:
> > Unless I missed it, this patch series needs a 00/11 email that describes
> > *what* "fast commit" is, and why we want it.  This should include some
> > benchmark results, since (I'd assume) that the "fast" part of the feature
> > name implies a performance improvement?
>
> For background, it's a simplified version of the scheme proposed by
> Park and Shin, in their paper, "iJournaling: Fine-Grained Journaling
> for Improving the Latency of Fsync System Call"[1]
>
> [1] https://www.usenix.org/conference/atc17/technical-sessions/presentation/park
>
> I agree we should have a cover letter for this patch series.  Also, we
> should add documentation to Documentation/filesystems/journaling.rst
> about this feature; what it does, how it works, its basic on-disk
> format changes, etc.
>
> The fs/jbd2 layer isn't as well documented as the fs/ext4 code, and
> bringing Documentation/filesystems/journaling.rst to the same level as
> Documentation/filesystems/ext4/* isn't a fair/reasonable request.  On
> the other hand, documenting what is being added by this patch series
> is something that I think we should do.
>
>                                     - Ted
Andreas Dilger July 23, 2019, 9:59 p.m. UTC | #4
On Jul 22, 2019, at 3:02 PM, Theodore Y. Ts'o <tytso@mit.edu> wrote:
> 
> On Mon, Jul 22, 2019 at 12:15:11PM -0600, Andreas Dilger wrote:
>> Unless I missed it, this patch series needs a 00/11 email that describes
>> *what* "fast commit" is, and why we want it.  This should include some
>> benchmark results, since (I'd assume) that the "fast" part of the feature
>> name implies a performance improvement?
> 
> For background, it's a simplified version of the scheme proposed by
> Park and Shin, in their paper, "iJournaling: Fine-Grained Journaling
> for Improving the Latency of Fsync System Call"[1]
> 
> [1] https://www.usenix.org/conference/atc17/technical-sessions/presentation/park
> 
> I agree we should have a cover letter for this patch series.  Also, we
> should add documentation to Documentation/filesystems/journaling.rst
> about this feature; what it does, how it works, its basic on-disk
> format changes, etc.

Thanks for the link, I hadn't read that paper previously.  From reading the
paper, it seems there are some things that should be addressed before the
patch is committed to the tree in order to maintain proper disk format
compatibility:
- the ijournal header shows a 256-byte inode.  In Lustre today (and also
  Samba and other xattr-intensive workloads) 512- or 1024-byte inodes are used
  in order to store more xattrs within the inode, so the size of the inode
  data in the ijournal header needs to match the actual inode size of the
  filesystem and not be a fixed size.  What if the inode size == blocksize?
- the ijournal header also shows a 4-byte inode number.  It would be prudent
  to reserve space for 64-bit inode numbers, or at least have some mechanism
  (flag) to indicate that a 64-bit inode is stored instead of a 32-bit inode.
- if there are many cores in a system, say 96, how much space will be used
  from the journal file by the per-core ijournal?
- what happens if multiple threads are writing to the same file with ijournal
  and per-core ijournal areas?  Will the same inode information be recorded
  in multiple ijournal areas?

Cheers, Andreas
harshad shirwadkar July 24, 2019, 6:03 a.m. UTC | #5
Before I respond to your questions, I would like to explain how fast
commits differ from ijournal in a few key aspects (I will make sure to
explain it in detail in patch 00/11 and documentation):

- Instead of storing extent blocks in a fast commit block, we only
store extents that were modified in a particular fast commit
transaction in tag-length-value format.

- Whenever the fast commit information (inode structure + changed
extents in TLV format) exceeds one block, we fall back to full commit.
Thus at this point, the number of blocks we write per fast commit
transaction is either the total number of files changed (if fast
commit was successfully performed) or the number of blocks that would
be written by a full commit transaction.

- To reduce complexity, there is no support for per-core fast commit areas.

Current design of fast commits is such that we try to perform fast
commits whenever possible but either if it's impossible to record file
system changes by fast commits or if we haven't yet added support for
dealing with a particular type of file system change, we fall back to
full commits. Whenever we later add more features to fast commits, we
probably would need more on-disk format changes for the fast commit
blocks and that would mean we burn feature flags. So, my guess is that
we would need to make a few judgement calls on whether we want to
exclude a few fast commit features, keep the patch series simple and
potentially burn feature flags later OR we save feature flags by
implementing those fast commit features.

On Tue, Jul 23, 2019 at 2:59 PM Andreas Dilger <adilger@dilger.ca> wrote:
>
> On Jul 22, 2019, at 3:02 PM, Theodore Y. Ts'o <tytso@mit.edu> wrote:
> >
> > On Mon, Jul 22, 2019 at 12:15:11PM -0600, Andreas Dilger wrote:
> >> Unless I missed it, this patch series needs a 00/11 email that describes
> >> *what* "fast commit" is, and why we want it.  This should include some
> >> benchmark results, since (I'd assume) that the "fast" part of the feature
> >> name implies a performance improvement?
> >
> > For background, it's a simplified version of the scheme proposed by
> > Park and Shin, in their paper, "iJournaling: Fine-Grained Journaling
> > for Improving the Latency of Fsync System Call"[1]
> >
> > [1] https://www.usenix.org/conference/atc17/technical-sessions/presentation/park
> >
> > I agree we should have a cover letter for this patch series.  Also, we
> > should add documentation to Documentation/filesystems/journaling.rst
> > about this feature; what it does, how it works, its basic on-disk
> > format changes, etc.
>
> Thanks for the link, I hadn't read that paper previously.  From reading the
> paper, it seems there are some things that should be addressed before the
> patch is committed to the tree in order to maintain proper disk format
> compatibility:
> - the ijournal header shows a 256-byte inode.  In Lustre today (and also
>   Samba and other xattr-intensive workloads) 512- or 1024-byte inodes are used
>   in order to store more xattrs within the inode, so the size of the inode
>   data in the ijournal header needs to match the actual inode size of the
>   filesystem and not be a fixed size.  What if the inode size == blocksize?

Okay, I agree. This is one of such questions where we need to decide
whether to exclude this fast commit feature request for now or not. I
think whether or not we actually support 512 or 1024 byte inodes in
this patch series, we definitely shouldn't assume in the fast commit
header that inodes are of a fixed size. I will fix it. Supporting
bigger inodes doesn't sound like it would result in big change in the
patch series. But I would like to know whether you think it's okay to
wait or not.

> - the ijournal header also shows a 4-byte inode number.  It would be prudent
>   to reserve space for 64-bit inode numbers, or at least have some mechanism
>   (flag) to indicate that a 64-bit inode is stored instead of a 32-bit inode.

Noted, will change that.

> - if there are many cores in a system, say 96, how much space will be used
>   from the journal file by the per-core ijournal?
> - what happens if multiple threads are writing to the same file with ijournal
>   and per-core ijournal areas?  Will the same inode information be recorded
>   in multiple ijournal areas?

As mentioned above, at least for now we keep it simple by not having
per-core fast commit areas.

Thanks,
Harshad

>
> Cheers, Andreas
>
>
>
>
>
Darrick Wong July 24, 2019, 6:12 a.m. UTC | #6
On Tue, Jul 23, 2019 at 11:03:54PM -0700, harshad shirwadkar wrote:
> Before I respond to your questions, I would like to explain how fast
> commits differ from ijournal in a few key aspects (I will make sure to
> explain it in detail in patch 00/11 and documentation):

Please do; I hadn't realized there were also journal ondisk format
changes, and these must be recorded in the ext4 disk format
documentation.

> - Instead of storing extent blocks in a fast commit block, we only
> store extents that were modified in a particular fast commit
> transaction in tag-length-value format.
> 
> - Whenever the fast commit information (inode structure + changed
> extents in TLV format) exceeds one block, we fall back to full commit.
> Thus at this point, the number of blocks we write per fast commit
> transaction is either the total number of files changed (if fast
> commit was successfully performed) or the number of blocks that would
> be written by a full commit transaction.
> 
> - To reduce complexity, there is no support for per-core fast commit areas.
> 
> Current design of fast commits is such that we try to perform fast
> commits whenever possible but either if it's impossible to record file
> system changes by fast commits or if we haven't yet added support for
> dealing with a particular type of file system change, we fall back to
> full commits. Whenever we later add more features to fast commits, we
> probably would need more on-disk format changes for the fast commit
> blocks and that would mean we burn feature flags. So, my guess is that
> we would need to make a few judgement calls on whether we want to
> exclude a few fast commit features, keep the patch series simple and
> potentially burn feature flags later OR we save feature flags by
> implementing those fast commit features.

Every feature flag you add doubles the size of the testing matrix.
If I were you I'd only want to test the (fastcommit) and (!fastcommit)
scenarios.

--D

> On Tue, Jul 23, 2019 at 2:59 PM Andreas Dilger <adilger@dilger.ca> wrote:
> >
> > On Jul 22, 2019, at 3:02 PM, Theodore Y. Ts'o <tytso@mit.edu> wrote:
> > >
> > > On Mon, Jul 22, 2019 at 12:15:11PM -0600, Andreas Dilger wrote:
> > >> Unless I missed it, this patch series needs a 00/11 email that describes
> > >> *what* "fast commit" is, and why we want it.  This should include some
> > >> benchmark results, since (I'd assume) that the "fast" part of the feature
> > >> name implies a performance improvement?
> > >
> > > For background, it's a simplified version of the scheme proposed by
> > > Park and Shin, in their paper, "iJournaling: Fine-Grained Journaling
> > > for Improving the Latency of Fsync System Call"[1]
> > >
> > > [1] https://www.usenix.org/conference/atc17/technical-sessions/presentation/park
> > >
> > > I agree we should have a cover letter for this patch series.  Also, we
> > > should add documentation to Documentation/filesystems/journaling.rst
> > > about this feature; what it does, how it works, its basic on-disk
> > > format changes, etc.
> >
> > Thanks for the link, I hadn't read that paper previously.  From reading the
> > paper, it seems there are some things that should be addressed before the
> > patch is committed to the tree in order to maintain proper disk format
> > compatibility:
> > - the ijournal header shows a 256-byte inode.  In Lustre today (and also
> >   Samba and other xattr-intensive workloads) 512- or 1024-byte inodes are used
> >   in order to store more xattrs within the inode, so the size of the inode
> >   data in the ijournal header needs to match the actual inode size of the
> >   filesystem and not be a fixed size.  What if the inode size == blocksize?
> 
> Okay, I agree. This is one of such questions where we need to decide
> whether to exclude this fast commit feature request for now or not. I
> think whether or not we actually support 512 or 1024 byte inodes in
> this patch series, we definitely shouldn't assume in the fast commit
> header that inodes are of a fixed size. I will fix it. Supporting
> bigger inodes doesn't sound like it would result in big change in the
> patch series. But I would like to know whether you think it's okay to
> wait or not.
> 
> > - the ijournal header also shows a 4-byte inode number.  It would be prudent
> >   to reserve space for 64-bit inode numbers, or at least have some mechanism
> >   (flag) to indicate that a 64-bit inode is stored instead of a 32-bit inode.
> 
> Noted, will change that.
> 
> > - if there are many cores in a system, say 96, how much space will be used
> >   from the journal file by the per-core ijournal?
> > - what happens if multiple threads are writing to the same file with ijournal
> >   and per-core ijournal areas?  Will the same inode information be recorded
> >   in multiple ijournal areas?
> 
> As mentioned above, at least for now we keep it simple by not having
> per-core fast commit areas.
> 
> Thanks,
> Harshad
> 
> >
> > Cheers, Andreas
> >
> >
> >
> >
> >
Theodore Ts'o July 24, 2019, 4:07 p.m. UTC | #7
On Tue, Jul 23, 2019 at 11:12:31PM -0700, Darrick J. Wong wrote:
> On Tue, Jul 23, 2019 at 11:03:54PM -0700, harshad shirwadkar wrote:
> > Before I respond to your questions, I would like to explain how fast
> > commits differ from ijournal in a few key aspects (I will make sure to
> > explain it in detail in patch 00/11 and documentation):
> 
> Please do; I hadn't realized there were also journal ondisk format
> changes, and these must be recorded in the ext4 disk format
> documentation.

Actually, the changes are almost entirely in the on-disk journal
layer.  The addition of the feature flag is really a UI issue, and
worth some discussion.

One of the goals was to make it easy to allow kernels which didn't
understand fast commit to be able to mount a file system which had
been cleanly unmounted --- but of course, if the file system needs
recovery, and fast commits are in the journal, we can't allow a fast
commit oblivious kernel (or e2fsck) from trying to replay the journal.

One way to do this would be with a mount option, but that's a bit ugly
--- and a mount option in /etc/fstab will cause a failure if a kernel
which doesn't understand that mount option is booted.

So the basic idea is to have a compat feature which means, "please use
fast commits if present", and then when the file system is mounted on
a fast-commit capable kernel, the incompat feature meaning "we're
using the fast commit feature".  (This is same design pattern used
with the HAS_JOURNAL compat feature and the NEEDS_RECOVERY incompat
feature.)

The next question is whether to use the compat and incompat feature
flags in the jbd2 superblock, or ext4-specific compat flags.  For the
incompat flag, there's no reason not to use the journal incompat flag.
But for the compat flag, we have better infrastructure for setting and
clearing ext4-level compat feature flags.  Aside from that, though,
there's no reason why we couldn't use the s_feature_compat field in
the journal superblock --- in which case, *all* of the on-diks format
changes would purely be on the jbd2 side of the ledger.

> Every feature flag you add doubles the size of the testing matrix.
> If I were you I'd only want to test the (fastcommit) and (!fastcommit)
> scenarios.

Sure, absolutely.  On the other hand, as the saying goes, "there comes
a time in any project where it's time to shoot the engineers and put
the d*mned thing into production".  One of the reasons why we're super
interested in this feature is to claw back the performance hit of
fde872682e17 ("ext4: force inode writes when nfsd calls
commit_metadata").  I fully expect that this feature is going to make
big difference to a number of customer workloads, so there is some
urgency to getting this feature into production.

On the flip side, if we leave some performance wins on the table, it's
absolutely true that it makes it harder to add those optimizations
later, and it increases the testing load, not to mention the forwards
and backwards compatibility issues.  It's an engineering trade-off.

    	      		    	     	     - Ted
Darrick Wong July 24, 2019, 4:56 p.m. UTC | #8
On Wed, Jul 24, 2019 at 12:07:49PM -0400, Theodore Y. Ts'o wrote:
> On Tue, Jul 23, 2019 at 11:12:31PM -0700, Darrick J. Wong wrote:
> > On Tue, Jul 23, 2019 at 11:03:54PM -0700, harshad shirwadkar wrote:
> > > Before I respond to your questions, I would like to explain how fast
> > > commits differ from ijournal in a few key aspects (I will make sure to
> > > explain it in detail in patch 00/11 and documentation):
> > 
> > Please do; I hadn't realized there were also journal ondisk format
> > changes, and these must be recorded in the ext4 disk format
> > documentation.
> 
> Actually, the changes are almost entirely in the on-disk journal
> layer.

I know.

Hmm, just as a reminder -- the ext4 disk format documentation
includes the jbd2 disk format documentation.

> The addition of the feature flag is really a UI issue, and
> worth some discussion.
> 
> One of the goals was to make it easy to allow kernels which didn't
> understand fast commit to be able to mount a file system which had
> been cleanly unmounted --- but of course, if the file system needs
> recovery, and fast commits are in the journal, we can't allow a fast
> commit oblivious kernel (or e2fsck) from trying to replay the journal.

BTW, are there patches to fix e2fsck to replay the factcommit journal?

> One way to do this would be with a mount option, but that's a bit ugly
> --- and a mount option in /etc/fstab will cause a failure if a kernel
> which doesn't understand that mount option is booted.
> 
> So the basic idea is to have a compat feature which means, "please use
> fast commits if present", and then when the file system is mounted on
> a fast-commit capable kernel, the incompat feature meaning "we're
> using the fast commit feature".  (This is same design pattern used
> with the HAS_JOURNAL compat feature and the NEEDS_RECOVERY incompat
> feature.)
> 
> The next question is whether to use the compat and incompat feature
> flags in the jbd2 superblock, or ext4-specific compat flags.  For the
> incompat flag, there's no reason not to use the journal incompat flag.
> But for the compat flag, we have better infrastructure for setting and
> clearing ext4-level compat feature flags.  Aside from that, though,
> there's no reason why we couldn't use the s_feature_compat field in
> the journal superblock --- in which case, *all* of the on-diks format
> changes would purely be on the jbd2 side of the ledger.

Probably better to use the journal compat flag so that the other jbd2
users can take advantage of it ... on the other hand, the only other
user (AFAIK) is ocfs2 and HAH.

> > Every feature flag you add doubles the size of the testing matrix.
> > If I were you I'd only want to test the (fastcommit) and (!fastcommit)
> > scenarios.
> 
> Sure, absolutely.  On the other hand, as the saying goes, "there comes
> a time in any project where it's time to shoot the engineers and put
> the d*mned thing into production".  One of the reasons why we're super
> interested in this feature is to claw back the performance hit of
> fde872682e17 ("ext4: force inode writes when nfsd calls
> commit_metadata").  I fully expect that this feature is going to make
> big difference to a number of customer workloads, so there is some
> urgency to getting this feature into production.
> 
> On the flip side, if we leave some performance wins on the table, it's
> absolutely true that it makes it harder to add those optimizations
> later, and it increases the testing load, not to mention the forwards
> and backwards compatibility issues.  It's an engineering trade-off.

<nod> I just remember hearing you complain about the size of the ext4
testing matrix in the past and figured you would't go for adding
fastcommit in small pieces each with new feature bits.

(I guess you could have a fastcommit_version field that increments every
time you add a new fastcommit journal item to constrain the combinatoric
explosion...)

--D

> 
>     	      		    	     	     - Ted
harshad shirwadkar July 24, 2019, 6:14 p.m. UTC | #9
On Wed, Jul 24, 2019 at 9:56 AM Darrick J. Wong <darrick.wong@oracle.com> wrote:
>
> On Wed, Jul 24, 2019 at 12:07:49PM -0400, Theodore Y. Ts'o wrote:
> > On Tue, Jul 23, 2019 at 11:12:31PM -0700, Darrick J. Wong wrote:
> > > On Tue, Jul 23, 2019 at 11:03:54PM -0700, harshad shirwadkar wrote:
> > > > Before I respond to your questions, I would like to explain how fast
> > > > commits differ from ijournal in a few key aspects (I will make sure to
> > > > explain it in detail in patch 00/11 and documentation):
> > >
> > > Please do; I hadn't realized there were also journal ondisk format
> > > changes, and these must be recorded in the ext4 disk format
> > > documentation.
> >
> > Actually, the changes are almost entirely in the on-disk journal
> > layer.
>
> I know.
>
> Hmm, just as a reminder -- the ext4 disk format documentation
> includes the jbd2 disk format documentation.
>
> > The addition of the feature flag is really a UI issue, and
> > worth some discussion.
> >
> > One of the goals was to make it easy to allow kernels which didn't
> > understand fast commit to be able to mount a file system which had
> > been cleanly unmounted --- but of course, if the file system needs
> > recovery, and fast commits are in the journal, we can't allow a fast
> > commit oblivious kernel (or e2fsck) from trying to replay the journal.
>
> BTW, are there patches to fix e2fsck to replay the factcommit journal?
>
> > One way to do this would be with a mount option, but that's a bit ugly
> > --- and a mount option in /etc/fstab will cause a failure if a kernel
> > which doesn't understand that mount option is booted.
> >
> > So the basic idea is to have a compat feature which means, "please use
> > fast commits if present", and then when the file system is mounted on
> > a fast-commit capable kernel, the incompat feature meaning "we're
> > using the fast commit feature".  (This is same design pattern used
> > with the HAS_JOURNAL compat feature and the NEEDS_RECOVERY incompat
> > feature.)
> >
> > The next question is whether to use the compat and incompat feature
> > flags in the jbd2 superblock, or ext4-specific compat flags.  For the
> > incompat flag, there's no reason not to use the journal incompat flag.
> > But for the compat flag, we have better infrastructure for setting and
> > clearing ext4-level compat feature flags.  Aside from that, though,
> > there's no reason why we couldn't use the s_feature_compat field in
> > the journal superblock --- in which case, *all* of the on-diks format
> > changes would purely be on the jbd2 side of the ledger.
>
> Probably better to use the journal compat flag so that the other jbd2
> users can take advantage of it ... on the other hand, the only other
> user (AFAIK) is ocfs2 and HAH.
>
> > > Every feature flag you add doubles the size of the testing matrix.
> > > If I were you I'd only want to test the (fastcommit) and (!fastcommit)
> > > scenarios.
> >
> > Sure, absolutely.  On the other hand, as the saying goes, "there comes
> > a time in any project where it's time to shoot the engineers and put
> > the d*mned thing into production".  One of the reasons why we're super
> > interested in this feature is to claw back the performance hit of
> > fde872682e17 ("ext4: force inode writes when nfsd calls
> > commit_metadata").  I fully expect that this feature is going to make
> > big difference to a number of customer workloads, so there is some
> > urgency to getting this feature into production.
> >
> > On the flip side, if we leave some performance wins on the table, it's
> > absolutely true that it makes it harder to add those optimizations
> > later, and it increases the testing load, not to mention the forwards
> > and backwards compatibility issues.  It's an engineering trade-off.
>
> <nod> I just remember hearing you complain about the size of the ext4
> testing matrix in the past and figured you would't go for adding
> fastcommit in small pieces each with new feature bits.
>
> (I guess you could have a fastcommit_version field that increments every
> time you add a new fastcommit journal item to constrain the combinatoric
> explosion...)
I agree, I was going to suggest the same. We would probably need to
add this field in all individual fast commit blocks, since we don't
have a fast commit superblock equivalent .. and changing jbd2
superblock is probably too much to ask for I guess.
>
> --D
>
> >
> >                                            - Ted
Theodore Ts'o July 25, 2019, 3:46 a.m. UTC | #10
On Wed, Jul 24, 2019 at 11:14:39AM -0700, harshad shirwadkar wrote:
> I agree, I was going to suggest the same. We would probably need to
> add this field in all individual fast commit blocks, since we don't
> have a fast commit superblock equivalent .. and changing jbd2
> superblock is probably too much to ask for I guess.

There's spare space in the jbd2 superblock.  Just grab a byte from
s_padding2...

					- Ted
Andreas Dilger July 25, 2019, 8:03 p.m. UTC | #11
On Jul 24, 2019, at 12:14 PM, harshad shirwadkar <harshadshirwadkar@gmail.com> wrote:
> 
> On Wed, Jul 24, 2019 at 9:56 AM Darrick J. Wong <darrick.wong@oracle.com> wrote:
>> 
>> (I guess you could have a fastcommit_version field that increments every
>> time you add a new fastcommit journal item to constrain the combinatoric
>> explosion...)
> 
> I agree, I was going to suggest the same. We would probably need to
> add this field in all individual fast commit blocks, since we don't
> have a fast commit superblock equivalent .. and changing jbd2
> superblock is probably too much to ask for I guess.

As a general design rule, whenever you see/think "version number", you
should instead use "feature flags" as is done in the ext2/3/4 superblock.
This doesn't take any more space, and is much more flexible.

This is a _much_ more flexible paradigm for compatibility, and doesn't
require the "version X must support every version <= X" behaviour that
a version number does.  With feature flags you can support feature bit
a+b+c, and if feature B is no longer useful it can be deprecated without
affecting the use of feature A or C.  It also makes it clear that bits
a+b+c are using features A+B+C, while storing "version 7" isn't clear
which feature is in use/needed.

Cheers, Andreas
diff mbox series

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bf660aa7a9e0..becbda38b7db 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1146,6 +1146,8 @@  struct ext4_inode_info {
 #define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM	0x00000008 /* User explicitly
 						specified journal checksum */
 
+#define EXT4_MOUNT2_JOURNAL_FAST_COMMIT	0x00000010 /* Journal fast commit */
+
 #define clear_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt &= \
 						~EXT4_MOUNT_##opt
 #define set_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt |= \
@@ -1643,6 +1645,7 @@  static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_COMPAT_RESIZE_INODE	0x0010
 #define EXT4_FEATURE_COMPAT_DIR_INDEX		0x0020
 #define EXT4_FEATURE_COMPAT_SPARSE_SUPER2	0x0200
+#define EXT4_FEATURE_COMPAT_FAST_COMMIT		0x0400
 
 #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER	0x0001
 #define EXT4_FEATURE_RO_COMPAT_LARGE_FILE	0x0002
@@ -1743,6 +1746,7 @@  EXT4_FEATURE_COMPAT_FUNCS(xattr,		EXT_ATTR)
 EXT4_FEATURE_COMPAT_FUNCS(resize_inode,		RESIZE_INODE)
 EXT4_FEATURE_COMPAT_FUNCS(dir_index,		DIR_INDEX)
 EXT4_FEATURE_COMPAT_FUNCS(sparse_super2,	SPARSE_SUPER2)
+EXT4_FEATURE_COMPAT_FUNCS(fast_commit,		FAST_COMMIT)
 
 EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super,	SPARSE_SUPER)
 EXT4_FEATURE_RO_COMPAT_FUNCS(large_file,	LARGE_FILE)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4079605d437a..e376ac040cce 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1455,6 +1455,7 @@  enum {
 	Opt_dioread_nolock, Opt_dioread_lock,
 	Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
 	Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
+	Opt_no_fastcommit
 };
 
 static const match_table_t tokens = {
@@ -1537,6 +1538,7 @@  static const match_table_t tokens = {
 	{Opt_init_itable, "init_itable=%u"},
 	{Opt_init_itable, "init_itable"},
 	{Opt_noinit_itable, "noinit_itable"},
+	{Opt_no_fastcommit, "no_fastcommit"},
 	{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
 	{Opt_test_dummy_encryption, "test_dummy_encryption"},
 	{Opt_nombcache, "nombcache"},
@@ -1659,6 +1661,7 @@  static int clear_qf_name(struct super_block *sb, int qtype)
 #define MOPT_NO_EXT3	0x0200
 #define MOPT_EXT4_ONLY	(MOPT_NO_EXT2 | MOPT_NO_EXT3)
 #define MOPT_STRING	0x0400
+#define MOPT_2		0x0800
 
 static const struct mount_opts {
 	int	token;
@@ -1751,6 +1754,8 @@  static const struct mount_opts {
 	{Opt_max_dir_size_kb, 0, MOPT_GTE0},
 	{Opt_test_dummy_encryption, 0, MOPT_GTE0},
 	{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
+	{Opt_no_fastcommit, EXT4_MOUNT2_JOURNAL_FAST_COMMIT,
+	 MOPT_CLEAR | MOPT_2 | MOPT_EXT4_ONLY},
 	{Opt_err, 0, 0}
 };
 
@@ -1858,8 +1863,9 @@  static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 			set_opt2(sb, EXPLICIT_DELALLOC);
 		} else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) {
 			set_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM);
-		} else
+		} else if (m->mount_opt) {
 			return -1;
+		}
 	}
 	if (m->flags & MOPT_CLEAR_ERR)
 		clear_opt(sb, ERRORS_MASK);
@@ -2027,10 +2033,17 @@  static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 			WARN_ON(1);
 			return -1;
 		}
-		if (arg != 0)
-			sbi->s_mount_opt |= m->mount_opt;
-		else
-			sbi->s_mount_opt &= ~m->mount_opt;
+		if (m->flags & MOPT_2) {
+			if (arg != 0)
+				sbi->s_mount_opt2 |= m->mount_opt;
+			else
+				sbi->s_mount_opt2 &= ~m->mount_opt;
+		} else {
+			if (arg != 0)
+				sbi->s_mount_opt |= m->mount_opt;
+			else
+				sbi->s_mount_opt &= ~m->mount_opt;
+		}
 	}
 	return 1;
 }
@@ -3733,6 +3746,9 @@  static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
 	set_opt(sb, POSIX_ACL);
 #endif
+	if (ext4_has_feature_fast_commit(sb))
+		set_opt2(sb, JOURNAL_FAST_COMMIT);
+
 	/* don't forget to enable journal_csum when metadata_csum is enabled. */
 	if (ext4_has_metadata_csum(sb))
 		set_opt(sb, JOURNAL_CHECKSUM);
@@ -4334,6 +4350,7 @@  static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
 		clear_opt(sb, JOURNAL_CHECKSUM);
 		clear_opt(sb, DATA_FLAGS);
+		clear_opt2(sb, JOURNAL_FAST_COMMIT);
 		sbi->s_journal = NULL;
 		needs_recovery = 0;
 		goto no_journal;
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index df03825ad1a1..b7eed49b8ecd 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -288,6 +288,7 @@  typedef struct journal_superblock_s
 #define JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT	0x00000004
 #define JBD2_FEATURE_INCOMPAT_CSUM_V2		0x00000008
 #define JBD2_FEATURE_INCOMPAT_CSUM_V3		0x00000010
+#define JBD2_FEATURE_INCOMPAT_FAST_COMMIT	0x00000020
 
 /* See "journal feature predicate functions" below */
 
@@ -298,7 +299,8 @@  typedef struct journal_superblock_s
 					JBD2_FEATURE_INCOMPAT_64BIT | \
 					JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | \
 					JBD2_FEATURE_INCOMPAT_CSUM_V2 | \
-					JBD2_FEATURE_INCOMPAT_CSUM_V3)
+					JBD2_FEATURE_INCOMPAT_CSUM_V3 | \
+					JBD2_FEATURE_INCOMPAT_FAST_COMMIT)
 
 #ifdef __KERNEL__
 
@@ -1235,6 +1237,7 @@  JBD2_FEATURE_INCOMPAT_FUNCS(64bit,		64BIT)
 JBD2_FEATURE_INCOMPAT_FUNCS(async_commit,	ASYNC_COMMIT)
 JBD2_FEATURE_INCOMPAT_FUNCS(csum2,		CSUM_V2)
 JBD2_FEATURE_INCOMPAT_FUNCS(csum3,		CSUM_V3)
+JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit,	FAST_COMMIT)
 
 /*
  * Journal flag definitions