Message ID | 1457922900-30367-2-git-send-email-daeho.jeong@samsung.com |
---|---|
State | New, archived |
Headers | show |
On Mon 14-03-16 11:34:59, Daeho Jeong wrote: > Now, in ext4, there is a race condition between changing inode journal > mode and ext4_writepages(). While ext4_writepages() is executed on > a non-journalled mode inode, the inode's journal mode could be enabled > by ioctl() and then, some pages dirtied after switching the journal > mode will be still exposed to ext4_writepages() in non-journaled mode. > To resolve this problem, we use fs-wide per-cpu rw semaphore by > Jan Kara's suggestion because we don't want to waste ext4_inode_info's > space for this extra rare case. > > Signed-off-by: Daeho Jeong <daeho.jeong@samsung.com> > Signed-off-by: Jan Kara <jack@suse.cz> Yeah, this patch is fine now. Thanks! Honza > --- > fs/ext4/ext4.h | 4 ++++ > fs/ext4/inode.c | 15 ++++++++++++--- > fs/ext4/super.c | 4 ++++ > kernel/locking/percpu-rwsem.c | 1 + > 4 files changed, 21 insertions(+), 3 deletions(-) > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h > index 157b458..c757a3d 100644 > --- a/fs/ext4/ext4.h > +++ b/fs/ext4/ext4.h > @@ -33,6 +33,7 @@ > #include <linux/ratelimit.h> > #include <crypto/hash.h> > #include <linux/falloc.h> > +#include <linux/percpu-rwsem.h> > #ifdef __KERNEL__ > #include <linux/compat.h> > #endif > @@ -1475,6 +1476,9 @@ struct ext4_sb_info { > struct ratelimit_state s_err_ratelimit_state; > struct ratelimit_state s_warning_ratelimit_state; > struct ratelimit_state s_msg_ratelimit_state; > + > + /* Barrier between changing inodes' journal flags and writepages ops. */ > + struct percpu_rw_semaphore s_journal_flag_rwsem; > }; > > static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c > index 71fab4c..4f45f24 100644 > --- a/fs/ext4/inode.c > +++ b/fs/ext4/inode.c > @@ -2476,11 +2476,14 @@ static int ext4_writepages(struct address_space *mapping, > struct blk_plug plug; > bool give_up_on_write = false; > > + percpu_down_read(&sbi->s_journal_flag_rwsem); > trace_ext4_writepages(inode, wbc); > > - if (dax_mapping(mapping)) > - return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, > - wbc); > + if (dax_mapping(mapping)) { > + ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, > + wbc); > + goto out_writepages; > + } > > /* > * No pages to write? This is mainly a kludge to avoid starting > @@ -2650,6 +2653,7 @@ retry: > out_writepages: > trace_ext4_writepages_result(inode, wbc, ret, > nr_to_write - wbc->nr_to_write); > + percpu_up_read(&sbi->s_journal_flag_rwsem); > return ret; > } > > @@ -5366,6 +5370,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) > journal_t *journal; > handle_t *handle; > int err; > + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); > > /* > * We have to be very careful here: changing a data block's > @@ -5405,6 +5410,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) > } > } > > + percpu_down_write(&sbi->s_journal_flag_rwsem); > jbd2_journal_lock_updates(journal); > > /* > @@ -5421,6 +5427,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) > err = jbd2_journal_flush(journal); > if (err < 0) { > jbd2_journal_unlock_updates(journal); > + percpu_up_write(&sbi->s_journal_flag_rwsem); > ext4_inode_resume_unlocked_dio(inode); > return err; > } > @@ -5429,6 +5436,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) > ext4_set_aops(inode); > > jbd2_journal_unlock_updates(journal); > + percpu_up_write(&sbi->s_journal_flag_rwsem); > + > if (val) > up_write(&EXT4_I(inode)->i_mmap_sem); > ext4_inode_resume_unlocked_dio(inode); > diff --git a/fs/ext4/super.c b/fs/ext4/super.c > index 3ed01ec..a12950d 100644 > --- a/fs/ext4/super.c > +++ b/fs/ext4/super.c > @@ -861,6 +861,7 @@ static void ext4_put_super(struct super_block *sb) > percpu_counter_destroy(&sbi->s_freeinodes_counter); > percpu_counter_destroy(&sbi->s_dirs_counter); > percpu_counter_destroy(&sbi->s_dirtyclusters_counter); > + percpu_free_rwsem(&sbi->s_journal_flag_rwsem); > brelse(sbi->s_sbh); > #ifdef CONFIG_QUOTA > for (i = 0; i < EXT4_MAXQUOTAS; i++) > @@ -3926,6 +3927,9 @@ no_journal: > if (!err) > err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, > GFP_KERNEL); > + if (!err) > + err = percpu_init_rwsem(&sbi->s_journal_flag_rwsem); > + > if (err) { > ext4_msg(sb, KERN_ERR, "insufficient memory"); > goto failed_mount6; > diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c > index f231e0b..bec0b64 100644 > --- a/kernel/locking/percpu-rwsem.c > +++ b/kernel/locking/percpu-rwsem.c > @@ -37,6 +37,7 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *brw) > free_percpu(brw->fast_read_ctr); > brw->fast_read_ctr = NULL; /* catch use after free bugs */ > } > +EXPORT_SYMBOL_GPL(percpu_free_rwsem); > > /* > * This is the fast-path for down_read/up_read. If it succeeds we rely > -- > 1.7.9.5 >
On Mon, Mar 14, 2016 at 11:34:59AM +0900, Daeho Jeong wrote: > Now, in ext4, there is a race condition between changing inode journal > mode and ext4_writepages(). While ext4_writepages() is executed on > a non-journalled mode inode, the inode's journal mode could be enabled > by ioctl() and then, some pages dirtied after switching the journal > mode will be still exposed to ext4_writepages() in non-journaled mode. > To resolve this problem, we use fs-wide per-cpu rw semaphore by > Jan Kara's suggestion because we don't want to waste ext4_inode_info's > space for this extra rare case. > > Signed-off-by: Daeho Jeong <daeho.jeong@samsung.com> > Signed-off-by: Jan Kara <jack@suse.cz> Hi Daeho, This patch is causing a regression for xfstests generic/231 when using a 1k block size (this case tests what happens when the blocksize < page size, such as if you are using a 4k blocksize on a Power PC system). If you have kvm-xfstests or gce-xfstests[1] installed you can reproduce it like this: gce-xfstests -c 1k -C 10 generic/231 (Or use kvm-xfstests if you don't want to get a GCE account). [1] http://thunk.org/gce-xfstests You will get this: BEGIN TEST 1k: Ext4 1k block Wed Apr 27 00:48:01 EDT 2016 DEVICE: /dev/mapper/xt-vdd MK2FS OPTIONS: -q -b 1024 MOUNT OPTIONS: -o block_validity FSTYP -- ext4 PLATFORM -- Linux/x86_64 xfstests-201604270046 4.6.0-rc4-ext4-00010-geed525e MKFS_OPTIONS -- -q -b 1024 /dev/mapper/xt-vdc MOUNT_OPTIONS -- -o acl,user_xattr -o block_validity /dev/mapper/xt-vdc /xt-vdc generic/231 64s ... [00:48:03] [00:49:44] [failed, exit status 1] - output mismatch (see /results/results-1k/generic/231.out.bad) --- tests/generic/231.out 2016-04-27 00:15:36.000000000 -0400 +++ /results/results-1k/generic/231.out.bad 2016-04-27 00:49:44.701895666 -0400 @@ -4,13 +4,10041 @@ Comparing user usage Comparing group usage === FSX Standard Mode, Memory Mapping, 4 Tasks === -All 20000 operations completed A-OK! -All 20000 operations completed A-OK! -All 20000 operations completed A-OK! -All 20000 operations completed A-OK! ... (Run 'diff -u tests/generic/231.out /results/results-1k/generic/231.out.bad' to see the entire diff) Ran: generic/231 Failures: generic/231 Failed 1 of 1 tests I'm going to drop patch 2 and 3 of your patch series from the ext4 tree for now. Could you take a closer look? Thanks, - Ted -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 157b458..c757a3d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -33,6 +33,7 @@ #include <linux/ratelimit.h> #include <crypto/hash.h> #include <linux/falloc.h> +#include <linux/percpu-rwsem.h> #ifdef __KERNEL__ #include <linux/compat.h> #endif @@ -1475,6 +1476,9 @@ struct ext4_sb_info { struct ratelimit_state s_err_ratelimit_state; struct ratelimit_state s_warning_ratelimit_state; struct ratelimit_state s_msg_ratelimit_state; + + /* Barrier between changing inodes' journal flags and writepages ops. */ + struct percpu_rw_semaphore s_journal_flag_rwsem; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 71fab4c..4f45f24 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2476,11 +2476,14 @@ static int ext4_writepages(struct address_space *mapping, struct blk_plug plug; bool give_up_on_write = false; + percpu_down_read(&sbi->s_journal_flag_rwsem); trace_ext4_writepages(inode, wbc); - if (dax_mapping(mapping)) - return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, - wbc); + if (dax_mapping(mapping)) { + ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, + wbc); + goto out_writepages; + } /* * No pages to write? This is mainly a kludge to avoid starting @@ -2650,6 +2653,7 @@ retry: out_writepages: trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); + percpu_up_read(&sbi->s_journal_flag_rwsem); return ret; } @@ -5366,6 +5370,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) journal_t *journal; handle_t *handle; int err; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); /* * We have to be very careful here: changing a data block's @@ -5405,6 +5410,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) } } + percpu_down_write(&sbi->s_journal_flag_rwsem); jbd2_journal_lock_updates(journal); /* @@ -5421,6 +5427,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) err = jbd2_journal_flush(journal); if (err < 0) { jbd2_journal_unlock_updates(journal); + percpu_up_write(&sbi->s_journal_flag_rwsem); ext4_inode_resume_unlocked_dio(inode); return err; } @@ -5429,6 +5436,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) ext4_set_aops(inode); jbd2_journal_unlock_updates(journal); + percpu_up_write(&sbi->s_journal_flag_rwsem); + if (val) up_write(&EXT4_I(inode)->i_mmap_sem); ext4_inode_resume_unlocked_dio(inode); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3ed01ec..a12950d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -861,6 +861,7 @@ static void ext4_put_super(struct super_block *sb) percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); + percpu_free_rwsem(&sbi->s_journal_flag_rwsem); brelse(sbi->s_sbh); #ifdef CONFIG_QUOTA for (i = 0; i < EXT4_MAXQUOTAS; i++) @@ -3926,6 +3927,9 @@ no_journal: if (!err) err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, GFP_KERNEL); + if (!err) + err = percpu_init_rwsem(&sbi->s_journal_flag_rwsem); + if (err) { ext4_msg(sb, KERN_ERR, "insufficient memory"); goto failed_mount6; diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index f231e0b..bec0b64 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -37,6 +37,7 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *brw) free_percpu(brw->fast_read_ctr); brw->fast_read_ctr = NULL; /* catch use after free bugs */ } +EXPORT_SYMBOL_GPL(percpu_free_rwsem); /* * This is the fast-path for down_read/up_read. If it succeeds we rely