diff mbox

[32/32] ext4: add nombcache mount option

Message ID 20170620091435.13560-1-tahsin@google.com
State Superseded, archived
Headers show

Commit Message

Tahsin Erdogan June 20, 2017, 9:14 a.m. UTC
The main purpose of mb cache is to achieve deduplication in
extended attributes. In use cases where opportunity for deduplication
is unlikely, it only adds overhead.

Add a mount option to explicitly turn off mb cache.

Suggested-by: Andreas Dilger <adilger@dilger.ca>
Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
 fs/ext4/ext4.h  |  1 +
 fs/ext4/super.c | 33 ++++++++++++++++++++++-----------
 fs/ext4/xattr.c | 55 ++++++++++++++++++++++++++++++++++++-------------------
 3 files changed, 59 insertions(+), 30 deletions(-)

Comments

Andreas Dilger June 21, 2017, 9:02 p.m. UTC | #1
On Jun 20, 2017, at 3:14 AM, Tahsin Erdogan <tahsin@google.com> wrote:
> 
> The main purpose of mb cache is to achieve deduplication in
> extended attributes. In use cases where opportunity for deduplication
> is unlikely, it only adds overhead.
> 
> Add a mount option to explicitly turn off mb cache.
> 
> Suggested-by: Andreas Dilger <adilger@dilger.ca>
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> ---
> fs/ext4/ext4.h  |  1 +
> fs/ext4/super.c | 33 ++++++++++++++++++++++-----------
> fs/ext4/xattr.c | 55 ++++++++++++++++++++++++++++++++++++-------------------
> 3 files changed, 59 insertions(+), 30 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 59e9488c4876..21aca1f87187 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1149,6 +1149,7 @@ struct ext4_inode_info {
> #define EXT4_MOUNT_DIOREAD_NOLOCK	0x400000 /* Enable support for dio read nolocking */
> #define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
> #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
> +#define EXT4_MOUNT_NO_MBCACHE		0x2000000 /* No mbcache usage */

We've been using 0x00001 for EXT4_MOUNT_NO_MBCACHE.

> #define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
> #define EXT4_MOUNT_DATA_ERR_ABORT	0x10000000 /* Abort on file data write */
> #define EXT4_MOUNT_BLOCK_VALIDITY	0x20000000 /* Block validity checking */
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 4b15bf674d45..4262a1b9b5b2 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -1336,7 +1336,7 @@ enum {
> 	Opt_inode_readahead_blks, Opt_journal_ioprio,
> 	Opt_dioread_nolock, Opt_dioread_lock,
> 	Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
> -	Opt_max_dir_size_kb, Opt_nojournal_checksum,
> +	Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
> };
> 
> static const match_table_t tokens = {
> @@ -1419,6 +1419,7 @@ static const match_table_t tokens = {
> 	{Opt_noinit_itable, "noinit_itable"},
> 	{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
> 	{Opt_test_dummy_encryption, "test_dummy_encryption"},
> +	{Opt_nombcache, "nombcache"},

For compatibility, please also add:

	{Opt_nombcache, "no_mbcache"},

> 	{Opt_removed, "check=none"},	/* mount option from ext2/3 */
> 	{Opt_removed, "nocheck"},	/* mount option from ext2/3 */
> 	{Opt_removed, "reservation"},	/* mount option from ext2/3 */
> @@ -1626,6 +1627,7 @@ static const struct mount_opts {
> 	{Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
> 	{Opt_max_dir_size_kb, 0, MOPT_GTE0},
> 	{Opt_test_dummy_encryption, 0, MOPT_GTE0},
> +	{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
> 	{Opt_err, 0, 0}
> };
> 
> @@ -4080,19 +4082,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> 	sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
> 
> no_journal:
> -	sbi->s_mb_cache = ext4_xattr_create_cache();
> -	if (!sbi->s_mb_cache) {
> -		ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache");
> -		goto failed_mount_wq;
> -	}
> -
> -	if (ext4_has_feature_ea_inode(sb)) {
> -		sbi->s_ea_inode_cache = ext4_xattr_create_cache();
> -		if (!sbi->s_ea_inode_cache) {
> +	if (!test_opt(sb, NO_MBCACHE)) {
> +		sbi->s_mb_cache = ext4_xattr_create_cache();

Should be named "s_mb_ea_block_cache" or similar?

> +		if (!sbi->s_mb_cache) {
> 			ext4_msg(sb, KERN_ERR,
> -				 "Failed to create an s_ea_inode_cache");
> +				 "Failed to create xattr block mb_cache");
> 			goto failed_mount_wq;
> 		}
> +
> +		if (ext4_has_feature_ea_inode(sb)) {
> +			sbi->s_ea_inode_cache = ext4_xattr_create_cache();

Should be named "s_mb_ea_inode_cache", or alternately use "s_ea_block_cache"
above?

> +			if (!sbi->s_ea_inode_cache) {
> +				ext4_msg(sb, KERN_ERR,
> +					 "Failed to create ea_inode mb_cache");
> +				goto failed_mount_wq;
> +			}
> +		}
> 	}
> 
> 	if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
> @@ -4989,6 +4994,12 @@ static int ext4_remount(struct super_block *sb, int *flags,
> 		}
> 	}
> 
> +	if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_NO_MBCACHE) {
> +		ext4_msg(sb, KERN_ERR, "can't enable nombcache during remount");
> +		err = -EINVAL;
> +		goto restore_opts;
> +	}

It appears that this restriction also applies to enabling mbcache during
remount as well, so the message should be updated?  In particular, if
NO_MBCACHE is currently set but isn't passed during the remount then this
will always cause the remount to fail.  That makes it harder for users to
just run something like "mount -o remount,ro".

It seems we are a bit inconsistent with how we handle changing mount options.
In some cases, if an option is set to be enabled but cannot be, it is actually
ignored (e.g. DAX, JOURNAL_CHECKSUM).  In other cases this returns an error.

In this case it is probably easiest to just ignore the change to this option,
as is done with JOURNAL_CHECKSUM and DAX.

> 	if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) {
> 		ext4_msg(sb, KERN_WARNING, "warning: refusing change of "
> 			"dax flag with busy inodes while remounting");
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index 9d1b600dc827..65ead540c684 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c

> @@ -1200,11 +1206,13 @@ ext4_xattr_release_block(handle_t *handle,
> 		if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) {
> 			struct mb_cache_entry *ce;

This could go inside the "if (ext4_mb_cache)" block.

> -			ce = mb_cache_entry_get(ext4_mb_cache, hash,
> -						bh->b_blocknr);
> -			if (ce) {
> -				ce->e_reusable = 1;
> -				mb_cache_entry_put(ext4_mb_cache, ce);
> +			if (ext4_mb_cache) {
> +				ce = mb_cache_entry_get(ext4_mb_cache, hash,
> +							bh->b_blocknr);
> +				if (ce) {
> +					ce->e_reusable = 1;
> +					mb_cache_entry_put(ext4_mb_cache, ce);
> +				}
> 			}
> 		}


Cheers, Andreas
Tahsin Erdogan June 21, 2017, 11:46 p.m. UTC | #2
>> +     if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_NO_MBCACHE) {
>> +             ext4_msg(sb, KERN_ERR, "can't enable nombcache during remount");
>> +             err = -EINVAL;
>> +             goto restore_opts;
>> +     }
>
> It appears that this restriction also applies to enabling mbcache during
> remount as well, so the message should be updated?  In particular, if
> NO_MBCACHE is currently set but isn't passed during the remount then this
> will always cause the remount to fail.  That makes it harder for users to
> just run something like "mount -o remount,ro".

Remount option handling is a bit strange. If an option is not
specified, it is not automatically cleared in sbi->s_mount_opt. So,
once nombcache option is set in the original mount, it is not possible
to clear it. I don't know whether this option handling behavior in
ext4 is deliberate but because of that, the mount -o remount,ro does
not result in a mount failure. The check above is effectively only to
protect against going from not having nombcache option to having
nombcache option, because the opposite is not possible.

Following shows how "debug" option remains set even though it wasn't
specified during remount:

# mount -o debug /dev/sdb /mnt/sdb
# mount | grep sdb
/dev/sdb on /mnt/sdb type ext4 (rw,debug)
# cat /proc/mounts | grep sdb
/dev/sdb /mnt/sdb ext4 rw,relatime,debug,data=ordered 0 0
# mount -o remount,ro /dev/sdb /mnt/sdb
# mount | grep sdb
/dev/sdb on /mnt/sdb type ext4 (ro)
# cat /proc/mounts | grep sdb
/dev/sdb /mnt/sdb ext4 ro,relatime,debug,data=ordered 0 0
Theodore Ts'o June 22, 2017, 4:42 a.m. UTC | #3
On Wed, Jun 21, 2017 at 04:46:48PM -0700, Tahsin Erdogan wrote:
> Remount option handling is a bit strange. If an option is not
> specified, it is not automatically cleared in sbi->s_mount_opt. So,
> once nombcache option is set in the original mount, it is not possible
> to clear it. I don't know whether this option handling behavior in
> ext4 is deliberate but because of that, the mount -o remount,ro does
> not result in a mount failure. The check above is effectively only to
> protect against going from not having nombcache option to having
> nombcache option, because the opposite is not possible.

The option handling behavior is deliberate, yes.  This is why we have
the mount options "lazytime" and "nolazytime", "discard" and
"nodiscard", "barrier" and "nobarrier", etc.

							 -Ted
diff mbox

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 59e9488c4876..21aca1f87187 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1149,6 +1149,7 @@  struct ext4_inode_info {
 #define EXT4_MOUNT_DIOREAD_NOLOCK	0x400000 /* Enable support for dio read nolocking */
 #define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
+#define EXT4_MOUNT_NO_MBCACHE		0x2000000 /* No mbcache usage */
 #define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
 #define EXT4_MOUNT_DATA_ERR_ABORT	0x10000000 /* Abort on file data write */
 #define EXT4_MOUNT_BLOCK_VALIDITY	0x20000000 /* Block validity checking */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4b15bf674d45..4262a1b9b5b2 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1336,7 +1336,7 @@  enum {
 	Opt_inode_readahead_blks, Opt_journal_ioprio,
 	Opt_dioread_nolock, Opt_dioread_lock,
 	Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
-	Opt_max_dir_size_kb, Opt_nojournal_checksum,
+	Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
 };
 
 static const match_table_t tokens = {
@@ -1419,6 +1419,7 @@  static const match_table_t tokens = {
 	{Opt_noinit_itable, "noinit_itable"},
 	{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
 	{Opt_test_dummy_encryption, "test_dummy_encryption"},
+	{Opt_nombcache, "nombcache"},
 	{Opt_removed, "check=none"},	/* mount option from ext2/3 */
 	{Opt_removed, "nocheck"},	/* mount option from ext2/3 */
 	{Opt_removed, "reservation"},	/* mount option from ext2/3 */
@@ -1626,6 +1627,7 @@  static const struct mount_opts {
 	{Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
 	{Opt_max_dir_size_kb, 0, MOPT_GTE0},
 	{Opt_test_dummy_encryption, 0, MOPT_GTE0},
+	{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
 	{Opt_err, 0, 0}
 };
 
@@ -4080,19 +4082,22 @@  static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
 
 no_journal:
-	sbi->s_mb_cache = ext4_xattr_create_cache();
-	if (!sbi->s_mb_cache) {
-		ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache");
-		goto failed_mount_wq;
-	}
-
-	if (ext4_has_feature_ea_inode(sb)) {
-		sbi->s_ea_inode_cache = ext4_xattr_create_cache();
-		if (!sbi->s_ea_inode_cache) {
+	if (!test_opt(sb, NO_MBCACHE)) {
+		sbi->s_mb_cache = ext4_xattr_create_cache();
+		if (!sbi->s_mb_cache) {
 			ext4_msg(sb, KERN_ERR,
-				 "Failed to create an s_ea_inode_cache");
+				 "Failed to create xattr block mb_cache");
 			goto failed_mount_wq;
 		}
+
+		if (ext4_has_feature_ea_inode(sb)) {
+			sbi->s_ea_inode_cache = ext4_xattr_create_cache();
+			if (!sbi->s_ea_inode_cache) {
+				ext4_msg(sb, KERN_ERR,
+					 "Failed to create ea_inode mb_cache");
+				goto failed_mount_wq;
+			}
+		}
 	}
 
 	if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
@@ -4989,6 +4994,12 @@  static int ext4_remount(struct super_block *sb, int *flags, char *data)
 		}
 	}
 
+	if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_NO_MBCACHE) {
+		ext4_msg(sb, KERN_ERR, "can't enable nombcache during remount");
+		err = -EINVAL;
+		goto restore_opts;
+	}
+
 	if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) {
 		ext4_msg(sb, KERN_WARNING, "warning: refusing change of "
 			"dax flag with busy inodes while remounting");
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 9d1b600dc827..65ead540c684 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -636,7 +636,6 @@  ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 	struct inode *inode = d_inode(dentry);
 	struct buffer_head *bh = NULL;
 	int error;
-	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
 
 	ea_idebug(inode, "buffer=%p, buffer_size=%ld",
 		  buffer, (long)buffer_size);
@@ -658,7 +657,7 @@  ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 		error = -EFSCORRUPTED;
 		goto cleanup;
 	}
-	ext4_xattr_block_cache_insert(ext4_mb_cache, bh);
+	ext4_xattr_block_cache_insert(EXT4_GET_MB_CACHE(inode), bh);
 	error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
 
 cleanup:
@@ -977,10 +976,13 @@  static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
 			set_nlink(ea_inode, 1);
 			ext4_orphan_del(handle, ea_inode);
 
-			hash = ext4_xattr_inode_get_hash(ea_inode);
-			mb_cache_entry_create(ea_inode_cache, GFP_NOFS, hash,
-					      ea_inode->i_ino,
-					      true /* reusable */);
+			if (ea_inode_cache) {
+				hash = ext4_xattr_inode_get_hash(ea_inode);
+				mb_cache_entry_create(ea_inode_cache,
+						      GFP_NOFS, hash,
+						      ea_inode->i_ino,
+						      true /* reusable */);
+			}
 		}
 	} else {
 		WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld",
@@ -994,9 +996,11 @@  static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
 			clear_nlink(ea_inode);
 			ext4_orphan_add(handle, ea_inode);
 
-			hash = ext4_xattr_inode_get_hash(ea_inode);
-			mb_cache_entry_delete(ea_inode_cache, hash,
-					      ea_inode->i_ino);
+			if (ea_inode_cache) {
+				hash = ext4_xattr_inode_get_hash(ea_inode);
+				mb_cache_entry_delete(ea_inode_cache, hash,
+						      ea_inode->i_ino);
+			}
 		}
 	}
 
@@ -1180,7 +1184,9 @@  ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 		 * This must happen under buffer lock for
 		 * ext4_xattr_block_set() to reliably detect freed block
 		 */
-		mb_cache_entry_delete(ext4_mb_cache, hash, bh->b_blocknr);
+		if (ext4_mb_cache)
+			mb_cache_entry_delete(ext4_mb_cache, hash,
+					      bh->b_blocknr);
 		get_bh(bh);
 		unlock_buffer(bh);
 
@@ -1200,11 +1206,13 @@  ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 		if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) {
 			struct mb_cache_entry *ce;
 
-			ce = mb_cache_entry_get(ext4_mb_cache, hash,
-						bh->b_blocknr);
-			if (ce) {
-				ce->e_reusable = 1;
-				mb_cache_entry_put(ext4_mb_cache, ce);
+			if (ext4_mb_cache) {
+				ce = mb_cache_entry_get(ext4_mb_cache, hash,
+							bh->b_blocknr);
+				if (ce) {
+					ce->e_reusable = 1;
+					mb_cache_entry_put(ext4_mb_cache, ce);
+				}
 			}
 		}
 
@@ -1383,6 +1391,9 @@  ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
 	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
 	void *ea_data;
 
+	if (!ea_inode_cache)
+		return NULL;
+
 	ce = mb_cache_entry_find_first(ea_inode_cache, hash);
 	if (!ce)
 		return NULL;
@@ -1453,8 +1464,9 @@  static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode,
 		return err;
 	}
 
-	mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash,
-			      ea_inode->i_ino, true /* reusable */);
+	if (EA_INODE_CACHE(inode))
+		mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash,
+				      ea_inode->i_ino, true /* reusable */);
 
 	*ret_inode = ea_inode;
 	return 0;
@@ -1781,8 +1793,9 @@  ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 			 * ext4_xattr_block_set() to reliably detect modified
 			 * block
 			 */
-			mb_cache_entry_delete(ext4_mb_cache, hash,
-					      bs->bh->b_blocknr);
+			if (ext4_mb_cache)
+				mb_cache_entry_delete(ext4_mb_cache, hash,
+						      bs->bh->b_blocknr);
 			ea_bdebug(bs->bh, "modifying in-place");
 			error = ext4_xattr_set_entry(i, s, handle, inode,
 						     true /* is_block */);
@@ -2871,6 +2884,8 @@  ext4_xattr_block_cache_insert(struct mb_cache *ext4_mb_cache,
 		       EXT4_XATTR_REFCOUNT_MAX;
 	int error;
 
+	if (!ext4_mb_cache)
+		return;
 	error = mb_cache_entry_create(ext4_mb_cache, GFP_NOFS, hash,
 				      bh->b_blocknr, reusable);
 	if (error) {
@@ -2937,6 +2952,8 @@  ext4_xattr_block_cache_find(struct inode *inode,
 	struct mb_cache_entry *ce;
 	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
 
+	if (!ext4_mb_cache)
+		return NULL;
 	if (!header->h_hash)
 		return NULL;  /* never share */
 	ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);