diff mbox series

[v2] ext4: fix race condition between ext4_write and ext4_convert_inline_data

Message ID 20220426140658.1046700-1-libaokun1@huawei.com
State Superseded
Headers show
Series [v2] ext4: fix race condition between ext4_write and ext4_convert_inline_data | expand

Commit Message

Baokun Li April 26, 2022, 2:06 p.m. UTC
Hulk Robot reported a BUG_ON:
 ==================================================================
 EXT4-fs error (device loop3): ext4_mb_generate_buddy:805: group 0,
 block bitmap and bg descriptor inconsistent: 25 vs 31513 free clusters
 kernel BUG at fs/ext4/ext4_jbd2.c:53!
 invalid opcode: 0000 [#1] SMP KASAN PTI
 CPU: 0 PID: 25371 Comm: syz-executor.3 Not tainted 5.10.0+ #1
 RIP: 0010:ext4_put_nojournal fs/ext4/ext4_jbd2.c:53 [inline]
 RIP: 0010:__ext4_journal_stop+0x10e/0x110 fs/ext4/ext4_jbd2.c:116
 [...]
 Call Trace:
  ext4_write_inline_data_end+0x59a/0x730 fs/ext4/inline.c:795
  generic_perform_write+0x279/0x3c0 mm/filemap.c:3344
  ext4_buffered_write_iter+0x2e3/0x3d0 fs/ext4/file.c:270
  ext4_file_write_iter+0x30a/0x11c0 fs/ext4/file.c:520
  do_iter_readv_writev+0x339/0x3c0 fs/read_write.c:732
  do_iter_write+0x107/0x430 fs/read_write.c:861
  vfs_writev fs/read_write.c:934 [inline]
  do_pwritev+0x1e5/0x380 fs/read_write.c:1031
 [...]
 ==================================================================

Above issue may happen as follows:
           cpu1                     cpu2
__________________________|__________________________
do_pwritev
 vfs_writev
  do_iter_write
   ext4_file_write_iter
    ext4_buffered_write_iter
     generic_perform_write
      ext4_da_write_begin
                           vfs_fallocate
                            ext4_fallocate
                             ext4_convert_inline_data
                              ext4_convert_inline_data_nolock
                               ext4_destroy_inline_data_nolock
                                clear EXT4_STATE_MAY_INLINE_DATA
                               ext4_map_blocks
                                ext4_ext_map_blocks
                                 ext4_mb_new_blocks
                                  ext4_mb_regular_allocator
                                   ext4_mb_good_group_nolock
                                    ext4_mb_init_group
                                     ext4_mb_init_cache
                                      ext4_mb_generate_buddy  --> error
       ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)
                                ext4_restore_inline_data
                                 set EXT4_STATE_MAY_INLINE_DATA
       ext4_block_write_begin
      ext4_da_write_end
       ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)
       ext4_write_inline_data_end
        handle=NULL
        ext4_journal_stop(handle)
         __ext4_journal_stop
          ext4_put_nojournal(handle)
           ref_cnt = (unsigned long)handle
           BUG_ON(ref_cnt == 0)  ---> BUG_ON

The lock held by ext4_convert_inline_data is xattr_sem, but the lock
held by generic_perform_write is i_rwsem. Therefore, the two locks can
be concurrent. To solve above issue, we just add inode_lock in
ext4_convert_inline_data.

Fixes: 0c8d414f163f ("ext4: let fallocate handle inline data correctly")
Cc: stable@vger.kernel.org
Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Baokun Li <libaokun1@huawei.com>
---
V1->V2:
	Increase the range of the inode_lock.

 fs/ext4/inline.c | 2 ++
 1 file changed, 2 insertions(+)

Comments

Jan Kara April 27, 2022, 12:31 p.m. UTC | #1
On Tue 26-04-22 22:06:58, Baokun Li wrote:
> Hulk Robot reported a BUG_ON:
>  ==================================================================
>  EXT4-fs error (device loop3): ext4_mb_generate_buddy:805: group 0,
>  block bitmap and bg descriptor inconsistent: 25 vs 31513 free clusters
>  kernel BUG at fs/ext4/ext4_jbd2.c:53!
>  invalid opcode: 0000 [#1] SMP KASAN PTI
>  CPU: 0 PID: 25371 Comm: syz-executor.3 Not tainted 5.10.0+ #1
>  RIP: 0010:ext4_put_nojournal fs/ext4/ext4_jbd2.c:53 [inline]
>  RIP: 0010:__ext4_journal_stop+0x10e/0x110 fs/ext4/ext4_jbd2.c:116
>  [...]
>  Call Trace:
>   ext4_write_inline_data_end+0x59a/0x730 fs/ext4/inline.c:795
>   generic_perform_write+0x279/0x3c0 mm/filemap.c:3344
>   ext4_buffered_write_iter+0x2e3/0x3d0 fs/ext4/file.c:270
>   ext4_file_write_iter+0x30a/0x11c0 fs/ext4/file.c:520
>   do_iter_readv_writev+0x339/0x3c0 fs/read_write.c:732
>   do_iter_write+0x107/0x430 fs/read_write.c:861
>   vfs_writev fs/read_write.c:934 [inline]
>   do_pwritev+0x1e5/0x380 fs/read_write.c:1031
>  [...]
>  ==================================================================
> 
> Above issue may happen as follows:
>            cpu1                     cpu2
> __________________________|__________________________
> do_pwritev
>  vfs_writev
>   do_iter_write
>    ext4_file_write_iter
>     ext4_buffered_write_iter
>      generic_perform_write
>       ext4_da_write_begin
>                            vfs_fallocate
>                             ext4_fallocate
>                              ext4_convert_inline_data
>                               ext4_convert_inline_data_nolock
>                                ext4_destroy_inline_data_nolock
>                                 clear EXT4_STATE_MAY_INLINE_DATA
>                                ext4_map_blocks
>                                 ext4_ext_map_blocks
>                                  ext4_mb_new_blocks
>                                   ext4_mb_regular_allocator
>                                    ext4_mb_good_group_nolock
>                                     ext4_mb_init_group
>                                      ext4_mb_init_cache
>                                       ext4_mb_generate_buddy  --> error
>        ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)
>                                 ext4_restore_inline_data
>                                  set EXT4_STATE_MAY_INLINE_DATA
>        ext4_block_write_begin
>       ext4_da_write_end
>        ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)
>        ext4_write_inline_data_end
>         handle=NULL
>         ext4_journal_stop(handle)
>          __ext4_journal_stop
>           ext4_put_nojournal(handle)
>            ref_cnt = (unsigned long)handle
>            BUG_ON(ref_cnt == 0)  ---> BUG_ON
> 
> The lock held by ext4_convert_inline_data is xattr_sem, but the lock
> held by generic_perform_write is i_rwsem. Therefore, the two locks can
> be concurrent. To solve above issue, we just add inode_lock in
> ext4_convert_inline_data.
> 
> Fixes: 0c8d414f163f ("ext4: let fallocate handle inline data correctly")
> Cc: stable@vger.kernel.org
> Reported-by: Hulk Robot <hulkci@huawei.com>
> Signed-off-by: Baokun Li <libaokun1@huawei.com>

Thanks for the report and the patch. I agree inode_lock is needed to
protect the conversion.  However you cannot lock it inside
ext4_convert_inline_data() because it is often called with inode_lock
already held (in fact in most of the other cases as far as I have checked).
So the right fix is to reorganize ext4_fallocate() a bit to do something
like:

	ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
	if (ext4_has_inline_data(inode)) {
		inode_lock(inode);
		ret = ext4_convert_inline_data(inode);
		inode_unlock(inode);
		if (ret)
			goto exit;
	}

and remove similar handling from ext4_punch_hole() (and make sure it uses
the new handling in ext4_fallocate()).

								Honza
kernel test robot May 3, 2022, 10:28 a.m. UTC | #2
Greeting,

FYI, we noticed the following commit (built with gcc-11):

commit: bc88befa0c58cf6a97b141854fba66eb26c1a8f4 ("[PATCH v2] ext4: fix race condition between ext4_write and ext4_convert_inline_data")
url: https://github.com/intel-lab-lkp/linux/commits/Baokun-Li/ext4-fix-race-condition-between-ext4_write-and-ext4_convert_inline_data/20220426-220149
base: https://git.kernel.org/cgit/linux/kernel/git/tytso/ext4.git dev
patch link: https://lore.kernel.org/linux-ext4/20220426140658.1046700-1-libaokun1@huawei.com

in testcase: xfstests
version: xfstests-x86_64-46e1b83-1_20220414
with following parameters:

	nr_pmem: 4
	fs: ext4
	test: generic-dax
	ucode: 0x700001c

test-description: xfstests is a regression test suite for xfs and other files ystems.
test-url: git://git.kernel.org/pub/scm/fs/xfs/xfstests-dev.git


on test machine: 16 threads 1 sockets Intel(R) Xeon(R) CPU D-1541 @ 2.10GHz with 48G memory

caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):



If you fix the issue, kindly add following tag
Reported-by: kernel test robot <oliver.sang@intel.com>


[  993.306815][  T112] INFO: task xfs_io:2214 blocked for more than 491 seconds.
[  993.315644][  T112]       Not tainted 5.17.0-rc5-00046-gbc88befa0c58 #1
[  993.323888][  T112] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  993.334067][  T112] task:xfs_io          state:D stack:    0 pid: 2214 ppid:  2213 flags:0x00000000
[  993.344788][  T112] Call Trace:
[  993.349603][  T112]  <TASK>
[ 993.354064][ T112] __schedule (kernel/sched/core.c:4995 kernel/sched/core.c:6304) 
[ 993.359989][ T112] ? io_schedule_timeout (kernel/sched/core.c:6183) 
[ 993.366775][ T112] ? _raw_spin_lock_irq (arch/x86/include/asm/atomic.h:202 include/linux/atomic/atomic-instrumented.h:543 include/asm-generic/qspinlock.h:82 include/linux/spinlock.h:185 include/linux/spinlock_api_smp.h:120 kernel/locking/spinlock.c:170) 
[ 993.373317][ T112] schedule (include/linux/instrumented.h:71 (discriminator 1) include/asm-generic/bitops/instrumented-non-atomic.h:134 (discriminator 1) include/linux/thread_info.h:118 (discriminator 1) include/linux/sched.h:2127 (discriminator 1) kernel/sched/core.c:6379 (discriminator 1)) 
[ 993.378880][ T112] rwsem_down_write_slowpath (arch/x86/include/asm/current.h:15 kernel/locking/rwsem.c:1144) 
[ 993.386003][ T112] ? check_preempt_curr (arch/x86/include/asm/bitops.h:207 (discriminator 1) include/asm-generic/bitops/instrumented-non-atomic.h:135 (discriminator 1) include/linux/thread_info.h:118 (discriminator 1) include/linux/sched.h:1991 (discriminator 1) include/linux/sched.h:2006 (discriminator 1) kernel/sched/core.c:2135 (discriminator 1)) 
[ 993.392684][ T112] ? attach_entity_cfs_rq (arch/x86/include/asm/atomic64_64.h:46 include/linux/atomic/atomic-long.h:53 include/linux/atomic/atomic-instrumented.h:1295 kernel/sched/fair.c:3332 kernel/sched/fair.c:11316) 
[ 993.399523][ T112] ? up_read (kernel/locking/rwsem.c:1053) 
[ 993.405085][ T112] ? __cond_resched (kernel/sched/core.c:8200) 
[ 993.411216][ T112] ? down_write (arch/x86/include/asm/atomic64_64.h:34 include/linux/atomic/atomic-long.h:41 include/linux/atomic/atomic-instrumented.h:1280 kernel/locking/rwsem.c:138 kernel/locking/rwsem.c:255 kernel/locking/rwsem.c:1258 kernel/locking/rwsem.c:1268 kernel/locking/rwsem.c:1515) 
[ 993.417072][ T112] ? ext4_convert_inline_data (arch/x86/include/asm/bitops.h:75 include/asm-generic/bitops/instrumented-atomic.h:42 fs/ext4/ext4.h:1933 fs/ext4/inline.c:2007) 
[ 993.424240][ T112] down_write (kernel/locking/rwsem.c:1516) 
[ 993.430013][ T112] ? down_write_killable (kernel/locking/rwsem.c:1512) 
[ 993.436732][ T112] ext4_fallocate (include/linux/instrumented.h:71 (discriminator 12) include/asm-generic/bitops/instrumented-non-atomic.h:134 (discriminator 12) fs/ext4/ext4.h:1918 (discriminator 12) fs/ext4/extents.c:4730 (discriminator 12)) 
[ 993.442821][ T112] vfs_fallocate (fs/open.c:308) 
[ 993.448801][ T112] __x64_sys_fallocate (include/linux/file.h:45 fs/open.c:332 fs/open.c:339 fs/open.c:337 fs/open.c:337) 
[ 993.455223][ T112] do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) 
[ 993.461038][ T112] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:113) 
[  993.468293][  T112] RIP: 0033:0x7f391abb246d
[  993.474055][  T112] RSP: 002b:00007ffe20742f10 EFLAGS: 00000293 ORIG_RAX: 000000000000011d
[  993.483796][  T112] RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007f391abb246d
[  993.493068][  T112] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000004
[  993.502334][  T112] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
[  993.511582][  T112] R10: 0000000000100000 R11: 0000000000000293 R12: 0000000000000000
[  993.520862][  T112] R13: 0000000000100000 R14: 000000000252bef0 R15: 0000000000000001
[  993.530120][  T112]  </TASK>
[ 1484.826867][  T112] INFO: task xfs_io:2214 blocked for more than 983 seconds.
[ 1484.835749][  T112]       Not tainted 5.17.0-rc5-00046-gbc88befa0c58 #1
[ 1484.843764][  T112] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 1484.853792][  T112] task:xfs_io          state:D stack:    0 pid: 2214 ppid:  2213 flags:0x00000000
[ 1484.864324][  T112] Call Trace:
[ 1484.868910][  T112]  <TASK>
[ 1484.873163][ T112] __schedule (kernel/sched/core.c:4995 kernel/sched/core.c:6304) 
[ 1484.878929][ T112] ? io_schedule_timeout (kernel/sched/core.c:6183) 
[ 1484.885546][ T112] ? _raw_spin_lock_irq (arch/x86/include/asm/atomic.h:202 include/linux/atomic/atomic-instrumented.h:543 include/asm-generic/qspinlock.h:82 include/linux/spinlock.h:185 include/linux/spinlock_api_smp.h:120 kernel/locking/spinlock.c:170) 
[ 1484.892010][ T112] schedule (include/linux/instrumented.h:71 (discriminator 1) include/asm-generic/bitops/instrumented-non-atomic.h:134 (discriminator 1) include/linux/thread_info.h:118 (discriminator 1) include/linux/sched.h:2127 (discriminator 1) kernel/sched/core.c:6379 (discriminator 1)) 
[ 1484.897384][ T112] rwsem_down_write_slowpath (arch/x86/include/asm/current.h:15 kernel/locking/rwsem.c:1144) 
[ 1484.904336][ T112] ? check_preempt_curr (arch/x86/include/asm/bitops.h:207 (discriminator 1) include/asm-generic/bitops/instrumented-non-atomic.h:135 (discriminator 1) include/linux/thread_info.h:118 (discriminator 1) include/linux/sched.h:1991 (discriminator 1) include/linux/sched.h:2006 (discriminator 1) kernel/sched/core.c:2135 (discriminator 1)) 
[ 1484.910819][ T112] ? attach_entity_cfs_rq (arch/x86/include/asm/atomic64_64.h:46 include/linux/atomic/atomic-long.h:53 include/linux/atomic/atomic-instrumented.h:1295 kernel/sched/fair.c:3332 kernel/sched/fair.c:11316) 
[ 1484.917512][ T112] ? up_read (kernel/locking/rwsem.c:1053) 
[ 1484.922830][ T112] ? __cond_resched (kernel/sched/core.c:8200) 
[ 1484.928787][ T112] ? down_write (arch/x86/include/asm/atomic64_64.h:34 include/linux/atomic/atomic-long.h:41 include/linux/atomic/atomic-instrumented.h:1280 kernel/locking/rwsem.c:138 kernel/locking/rwsem.c:255 kernel/locking/rwsem.c:1258 kernel/locking/rwsem.c:1268 kernel/locking/rwsem.c:1515) 
[ 1484.934453][ T112] ? ext4_convert_inline_data (arch/x86/include/asm/bitops.h:75 include/asm-generic/bitops/instrumented-atomic.h:42 fs/ext4/ext4.h:1933 fs/ext4/inline.c:2007) 
[ 1484.941387][ T112] down_write (kernel/locking/rwsem.c:1516) 
[ 1484.946908][ T112] ? down_write_killable (kernel/locking/rwsem.c:1512) 
[ 1484.953416][ T112] ext4_fallocate (include/linux/instrumented.h:71 (discriminator 12) include/asm-generic/bitops/instrumented-non-atomic.h:134 (discriminator 12) fs/ext4/ext4.h:1918 (discriminator 12) fs/ext4/extents.c:4730 (discriminator 12)) 
[ 1484.959250][ T112] vfs_fallocate (fs/open.c:308) 
[ 1484.964958][ T112] __x64_sys_fallocate (include/linux/file.h:45 fs/open.c:332 fs/open.c:339 fs/open.c:337 fs/open.c:337) 
[ 1484.971154][ T112] do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) 
[ 1484.976720][ T112] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:113) 
[ 1484.983783][  T112] RIP: 0033:0x7f391abb246d
[ 1484.989416][  T112] RSP: 002b:00007ffe20742f10 EFLAGS: 00000293 ORIG_RAX: 000000000000011d
[ 1484.999026][  T112] RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007f391abb246d
[ 1485.008215][  T112] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000004
[ 1485.017408][  T112] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
[ 1485.026622][  T112] R10: 0000000000100000 R11: 0000000000000293 R12: 0000000000000000
[ 1485.035801][  T112] R13: 0000000000100000 R14: 000000000252bef0 R15: 0000000000000001
[ 1485.045007][  T112]  </TASK>


To reproduce:

        git clone https://github.com/intel/lkp-tests.git
        cd lkp-tests
        sudo bin/lkp install job.yaml           # job file is attached in this email
        bin/lkp split-job --compatible job.yaml # generate the yaml file for lkp run
        sudo bin/lkp run generated-yaml-file

        # if come across any failure that blocks the test,
        # please remove ~/.lkp and /lkp dir to run from a clean state.
diff mbox series

Patch

diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 9c076262770d..0518edcfc0e1 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -2002,6 +2002,7 @@  int ext4_convert_inline_data(struct inode *inode)
 	handle_t *handle;
 	struct ext4_iloc iloc;
 
+	inode_lock(inode);
 	if (!ext4_has_inline_data(inode)) {
 		ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
 		return 0;
@@ -2024,6 +2025,7 @@  int ext4_convert_inline_data(struct inode *inode)
 	if (ext4_has_inline_data(inode))
 		error = ext4_convert_inline_data_nolock(handle, inode, &iloc);
 	ext4_write_unlock_xattr(inode, &no_expand);
+	inode_unlock(inode);
 	ext4_journal_stop(handle);
 out_free:
 	brelse(iloc.bh);