diff mbox

[v4,2/3] ext4: fix races between changing inode journal mode and ext4_writepages

Message ID 1457922900-30367-2-git-send-email-daeho.jeong@samsung.com
State New, archived
Headers show

Commit Message

Daeho Jeong March 14, 2016, 2:34 a.m. UTC
Now, in ext4, there is a race condition between changing inode journal
mode and ext4_writepages(). While ext4_writepages() is executed on
a non-journalled mode inode, the inode's journal mode could be enabled
by ioctl() and then, some pages dirtied after switching the journal
mode will be still exposed to ext4_writepages() in non-journaled mode.
To resolve this problem, we use fs-wide per-cpu rw semaphore by
Jan Kara's suggestion because we don't want to waste ext4_inode_info's
space for this extra rare case.

Signed-off-by: Daeho Jeong <daeho.jeong@samsung.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/ext4.h                |    4 ++++
 fs/ext4/inode.c               |   15 ++++++++++++---
 fs/ext4/super.c               |    4 ++++
 kernel/locking/percpu-rwsem.c |    1 +
 4 files changed, 21 insertions(+), 3 deletions(-)

Comments

Jan Kara March 14, 2016, 7:43 a.m. UTC | #1
On Mon 14-03-16 11:34:59, Daeho Jeong wrote:
> Now, in ext4, there is a race condition between changing inode journal
> mode and ext4_writepages(). While ext4_writepages() is executed on
> a non-journalled mode inode, the inode's journal mode could be enabled
> by ioctl() and then, some pages dirtied after switching the journal
> mode will be still exposed to ext4_writepages() in non-journaled mode.
> To resolve this problem, we use fs-wide per-cpu rw semaphore by
> Jan Kara's suggestion because we don't want to waste ext4_inode_info's
> space for this extra rare case.
> 
> Signed-off-by: Daeho Jeong <daeho.jeong@samsung.com>
> Signed-off-by: Jan Kara <jack@suse.cz>

Yeah, this patch is fine now. Thanks!

								Honza

> ---
>  fs/ext4/ext4.h                |    4 ++++
>  fs/ext4/inode.c               |   15 ++++++++++++---
>  fs/ext4/super.c               |    4 ++++
>  kernel/locking/percpu-rwsem.c |    1 +
>  4 files changed, 21 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 157b458..c757a3d 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -33,6 +33,7 @@
>  #include <linux/ratelimit.h>
>  #include <crypto/hash.h>
>  #include <linux/falloc.h>
> +#include <linux/percpu-rwsem.h>
>  #ifdef __KERNEL__
>  #include <linux/compat.h>
>  #endif
> @@ -1475,6 +1476,9 @@ struct ext4_sb_info {
>  	struct ratelimit_state s_err_ratelimit_state;
>  	struct ratelimit_state s_warning_ratelimit_state;
>  	struct ratelimit_state s_msg_ratelimit_state;
> +
> +	/* Barrier between changing inodes' journal flags and writepages ops. */
> +	struct percpu_rw_semaphore s_journal_flag_rwsem;
>  };
>  
>  static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 71fab4c..4f45f24 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -2476,11 +2476,14 @@ static int ext4_writepages(struct address_space *mapping,
>  	struct blk_plug plug;
>  	bool give_up_on_write = false;
>  
> +	percpu_down_read(&sbi->s_journal_flag_rwsem);
>  	trace_ext4_writepages(inode, wbc);
>  
> -	if (dax_mapping(mapping))
> -		return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
> -						   wbc);
> +	if (dax_mapping(mapping)) {
> +		ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
> +						  wbc);
> +		goto out_writepages;
> +	}
>  
>  	/*
>  	 * No pages to write? This is mainly a kludge to avoid starting
> @@ -2650,6 +2653,7 @@ retry:
>  out_writepages:
>  	trace_ext4_writepages_result(inode, wbc, ret,
>  				     nr_to_write - wbc->nr_to_write);
> +	percpu_up_read(&sbi->s_journal_flag_rwsem);
>  	return ret;
>  }
>  
> @@ -5366,6 +5370,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
>  	journal_t *journal;
>  	handle_t *handle;
>  	int err;
> +	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
>  
>  	/*
>  	 * We have to be very careful here: changing a data block's
> @@ -5405,6 +5410,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
>  		}
>  	}
>  
> +	percpu_down_write(&sbi->s_journal_flag_rwsem);
>  	jbd2_journal_lock_updates(journal);
>  
>  	/*
> @@ -5421,6 +5427,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
>  		err = jbd2_journal_flush(journal);
>  		if (err < 0) {
>  			jbd2_journal_unlock_updates(journal);
> +			percpu_up_write(&sbi->s_journal_flag_rwsem);
>  			ext4_inode_resume_unlocked_dio(inode);
>  			return err;
>  		}
> @@ -5429,6 +5436,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
>  	ext4_set_aops(inode);
>  
>  	jbd2_journal_unlock_updates(journal);
> +	percpu_up_write(&sbi->s_journal_flag_rwsem);
> +
>  	if (val)
>  		up_write(&EXT4_I(inode)->i_mmap_sem);
>  	ext4_inode_resume_unlocked_dio(inode);
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 3ed01ec..a12950d 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -861,6 +861,7 @@ static void ext4_put_super(struct super_block *sb)
>  	percpu_counter_destroy(&sbi->s_freeinodes_counter);
>  	percpu_counter_destroy(&sbi->s_dirs_counter);
>  	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
> +	percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
>  	brelse(sbi->s_sbh);
>  #ifdef CONFIG_QUOTA
>  	for (i = 0; i < EXT4_MAXQUOTAS; i++)
> @@ -3926,6 +3927,9 @@ no_journal:
>  	if (!err)
>  		err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
>  					  GFP_KERNEL);
> +	if (!err)
> +		err = percpu_init_rwsem(&sbi->s_journal_flag_rwsem);
> +
>  	if (err) {
>  		ext4_msg(sb, KERN_ERR, "insufficient memory");
>  		goto failed_mount6;
> diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
> index f231e0b..bec0b64 100644
> --- a/kernel/locking/percpu-rwsem.c
> +++ b/kernel/locking/percpu-rwsem.c
> @@ -37,6 +37,7 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
>  	free_percpu(brw->fast_read_ctr);
>  	brw->fast_read_ctr = NULL; /* catch use after free bugs */
>  }
> +EXPORT_SYMBOL_GPL(percpu_free_rwsem);
>  
>  /*
>   * This is the fast-path for down_read/up_read. If it succeeds we rely
> -- 
> 1.7.9.5
>
Theodore Ts'o April 27, 2016, 5:05 a.m. UTC | #2
On Mon, Mar 14, 2016 at 11:34:59AM +0900, Daeho Jeong wrote:
> Now, in ext4, there is a race condition between changing inode journal
> mode and ext4_writepages(). While ext4_writepages() is executed on
> a non-journalled mode inode, the inode's journal mode could be enabled
> by ioctl() and then, some pages dirtied after switching the journal
> mode will be still exposed to ext4_writepages() in non-journaled mode.
> To resolve this problem, we use fs-wide per-cpu rw semaphore by
> Jan Kara's suggestion because we don't want to waste ext4_inode_info's
> space for this extra rare case.
> 
> Signed-off-by: Daeho Jeong <daeho.jeong@samsung.com>
> Signed-off-by: Jan Kara <jack@suse.cz>

Hi Daeho,

This patch is causing a regression for xfstests generic/231 when using
a 1k block size (this case tests what happens when the blocksize <
page size, such as if you are using a 4k blocksize on a Power PC
system).

If you have kvm-xfstests or gce-xfstests[1] installed you can
reproduce it like this:

	gce-xfstests -c 1k -C 10 generic/231

(Or use kvm-xfstests if you don't want to get a GCE account).

[1] http://thunk.org/gce-xfstests

You will get this:

BEGIN TEST 1k: Ext4 1k block Wed Apr 27 00:48:01 EDT 2016
DEVICE: /dev/mapper/xt-vdd
MK2FS OPTIONS: -q -b 1024
MOUNT OPTIONS: -o block_validity
FSTYP         -- ext4
PLATFORM      -- Linux/x86_64 xfstests-201604270046 4.6.0-rc4-ext4-00010-geed525e
MKFS_OPTIONS  -- -q -b 1024 /dev/mapper/xt-vdc
MOUNT_OPTIONS -- -o acl,user_xattr -o block_validity /dev/mapper/xt-vdc /xt-vdc

generic/231 64s ...	[00:48:03] [00:49:44] [failed, exit status 1] - output mismatch (see /results/results-1k/generic/231.out.bad)
    --- tests/generic/231.out	2016-04-27 00:15:36.000000000 -0400
    +++ /results/results-1k/generic/231.out.bad	2016-04-27 00:49:44.701895666 -0400
    @@ -4,13 +4,10041 @@
     Comparing user usage
     Comparing group usage
     === FSX Standard Mode, Memory Mapping, 4 Tasks ===
    -All 20000 operations completed A-OK!
    -All 20000 operations completed A-OK!
    -All 20000 operations completed A-OK!
    -All 20000 operations completed A-OK!
    ...
    (Run 'diff -u tests/generic/231.out /results/results-1k/generic/231.out.bad'  to see the entire diff)
Ran: generic/231
Failures: generic/231
Failed 1 of 1 tests

I'm going to drop patch 2 and 3 of your patch series from the ext4
tree for now.   Could you take a closer look?

Thanks,

						- Ted
						
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 157b458..c757a3d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -33,6 +33,7 @@ 
 #include <linux/ratelimit.h>
 #include <crypto/hash.h>
 #include <linux/falloc.h>
+#include <linux/percpu-rwsem.h>
 #ifdef __KERNEL__
 #include <linux/compat.h>
 #endif
@@ -1475,6 +1476,9 @@  struct ext4_sb_info {
 	struct ratelimit_state s_err_ratelimit_state;
 	struct ratelimit_state s_warning_ratelimit_state;
 	struct ratelimit_state s_msg_ratelimit_state;
+
+	/* Barrier between changing inodes' journal flags and writepages ops. */
+	struct percpu_rw_semaphore s_journal_flag_rwsem;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 71fab4c..4f45f24 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2476,11 +2476,14 @@  static int ext4_writepages(struct address_space *mapping,
 	struct blk_plug plug;
 	bool give_up_on_write = false;
 
+	percpu_down_read(&sbi->s_journal_flag_rwsem);
 	trace_ext4_writepages(inode, wbc);
 
-	if (dax_mapping(mapping))
-		return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
-						   wbc);
+	if (dax_mapping(mapping)) {
+		ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
+						  wbc);
+		goto out_writepages;
+	}
 
 	/*
 	 * No pages to write? This is mainly a kludge to avoid starting
@@ -2650,6 +2653,7 @@  retry:
 out_writepages:
 	trace_ext4_writepages_result(inode, wbc, ret,
 				     nr_to_write - wbc->nr_to_write);
+	percpu_up_read(&sbi->s_journal_flag_rwsem);
 	return ret;
 }
 
@@ -5366,6 +5370,7 @@  int ext4_change_inode_journal_flag(struct inode *inode, int val)
 	journal_t *journal;
 	handle_t *handle;
 	int err;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 
 	/*
 	 * We have to be very careful here: changing a data block's
@@ -5405,6 +5410,7 @@  int ext4_change_inode_journal_flag(struct inode *inode, int val)
 		}
 	}
 
+	percpu_down_write(&sbi->s_journal_flag_rwsem);
 	jbd2_journal_lock_updates(journal);
 
 	/*
@@ -5421,6 +5427,7 @@  int ext4_change_inode_journal_flag(struct inode *inode, int val)
 		err = jbd2_journal_flush(journal);
 		if (err < 0) {
 			jbd2_journal_unlock_updates(journal);
+			percpu_up_write(&sbi->s_journal_flag_rwsem);
 			ext4_inode_resume_unlocked_dio(inode);
 			return err;
 		}
@@ -5429,6 +5436,8 @@  int ext4_change_inode_journal_flag(struct inode *inode, int val)
 	ext4_set_aops(inode);
 
 	jbd2_journal_unlock_updates(journal);
+	percpu_up_write(&sbi->s_journal_flag_rwsem);
+
 	if (val)
 		up_write(&EXT4_I(inode)->i_mmap_sem);
 	ext4_inode_resume_unlocked_dio(inode);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3ed01ec..a12950d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -861,6 +861,7 @@  static void ext4_put_super(struct super_block *sb)
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
 	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+	percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
 	brelse(sbi->s_sbh);
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < EXT4_MAXQUOTAS; i++)
@@ -3926,6 +3927,9 @@  no_journal:
 	if (!err)
 		err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
 					  GFP_KERNEL);
+	if (!err)
+		err = percpu_init_rwsem(&sbi->s_journal_flag_rwsem);
+
 	if (err) {
 		ext4_msg(sb, KERN_ERR, "insufficient memory");
 		goto failed_mount6;
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index f231e0b..bec0b64 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -37,6 +37,7 @@  void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
 	free_percpu(brw->fast_read_ctr);
 	brw->fast_read_ctr = NULL; /* catch use after free bugs */
 }
+EXPORT_SYMBOL_GPL(percpu_free_rwsem);
 
 /*
  * This is the fast-path for down_read/up_read. If it succeeds we rely