diff mbox series

[v4,13/19] ext4: call ext4_mb_mark_group_bb in ext4_free_blocks_simple

Message ID 20230603150327.3596033-14-shikemeng@huaweicloud.com
State Awaiting Upstream
Headers show
Series Fixes, cleanups and unit test for mballoc | expand

Commit Message

Kemeng Shi June 3, 2023, 3:03 p.m. UTC
Call ext4_mb_mark_group_bb in ext4_free_blocks_simple to:
1. remove repeated code.
2. pair with the update of free_clusters in ext4_mb_new_blocks_simple.
3. add the missing ext4_lock_group/ext4_unlock_group protection.

Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
---
 fs/ext4/mballoc.c | 39 +++++++--------------------------------
 1 file changed, 7 insertions(+), 32 deletions(-)

Comments

Theodore Ts'o June 11, 2023, 5:05 a.m. UTC | #1
On Sat, Jun 03, 2023 at 11:03:21PM +0800, Kemeng Shi wrote:
> call ext4_mb_mark_group_bb in ext4_free_blocks_simple to:
> 1. remove repeat code
> 2. pair update of free_clusters in ext4_mb_new_blocks_simple.
> 3. add missing ext4_lock_group/ext4_unlock_group protection.
> 
> Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
> Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>

Note: after bisecting, I've found that this commit is causing an OOPS
when running "kvm-xfstests -c ext4/adv generic/468".  It appears to be
an issue with the fast commit feature not playing nice with this
patch.  The stack trace looks like this:

[    7.409663] ------------[ cut here ]------------
[    7.409969] WARNING: CPU: 0 PID: 3069 at fs/ext4/mballoc.c:3801 ext4_mb_mark_group_bb+0x48e/0x4a0
[    7.410480] CPU: 0 PID: 3069 Comm: mount Not tainted 6.4.0-rc5-xfstests-lockdep-00021-g60ba685c5998 #146
[    7.411067] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
[    7.411639] RIP: 0010:ext4_mb_mark_group_bb+0x48e/0x4a0
[    7.411968] Code: 48 c7 c7 35 b0 88 82 c6 05 16 f4 9b 01 01 e8 f9 16 c9 ff e9 7f fe ff ff 8b 45 08 c7 44 24 10 00 00 00 00 31 c9 e9 ef fc ff ff <0f> 0b e9 76 fc ff ff e8 96 64 b6 00 66 0f 1f 44 00 00 90 90 90 90
[    7.413128] RSP: 0018:ffffc90003b0f9f8 EFLAGS: 00010246
[    7.413458] RAX: 0000000000000003 RBX: 0000000000006002 RCX: 0000000000000001
[    7.413902] RDX: ffff88800965b000 RSI: 0000000000000000 RDI: ffff88800d690100
[    7.414346] RBP: ffffc90003b0fa68 R08: 000000000aebbd6e R09: 0000000000000246
[    7.414791] R10: 00000000d148c994 R11: 00000000941da2bb R12: ffff88800d7fd000
[    7.415234] R13: 0000000000000000 R14: ffff88800f3e4080 R15: ffff88800b5ca160
[    7.415724] FS:  00007f3d04516840(0000) GS:ffff88807da00000(0000) knlGS:0000000000000000
[    7.416227] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    7.416588] CR2: 00007ffcb3979ac8 CR3: 000000000f290003 CR4: 0000000000770ef0
[    7.417032] PKRU: 55555554
[    7.417205] Call Trace:
[    7.417363]  <TASK>
[    7.417502]  ? ext4_mb_mark_group_bb+0x48e/0x4a0
[    7.417807]  ? __warn+0x80/0x170
[    7.418051]  ? ext4_mb_mark_group_bb+0x48e/0x4a0
[    7.418337]  ? report_bug+0x173/0x1d0
[    7.418567]  ? handle_bug+0x3c/0x70
[    7.418797]  ? exc_invalid_op+0x17/0x70
[    7.419037]  ? asm_exc_invalid_op+0x1a/0x20
[    7.419226]  ? ext4_mb_mark_group_bb+0x48e/0x4a0
[    7.419437]  ? ext4_mb_mark_group_bb+0xae/0x4a0
[    7.419708]  ext4_mb_mark_bb+0xc0/0x120
[    7.419946]  ext4_ext_clear_bb+0x210/0x280
[    7.420198]  ext4_fc_replay_inode+0xa1/0x380
[    7.420466]  ext4_fc_replay+0x435/0x880
[    7.420703]  ? __getblk_gfp+0x37/0x110
[    7.420938]  ? jread+0x7a/0x180
[    7.421138]  do_one_pass+0x7df/0x1040
[    7.421365]  jbd2_journal_recover+0x150/0x250
[    7.421637]  jbd2_journal_load+0xbe/0x190
[    7.421886]  ext4_load_journal+0x214/0x610
[    7.422152]  ext4_load_and_init_journal+0x29/0x380
[    7.422490]  __ext4_fill_super+0x15ca/0x15e0
[    7.422756]  ? __pfx_ext4_fill_super+0x10/0x10
[    7.423032]  ext4_fill_super+0xcf/0x280
[    7.423270]  get_tree_bdev+0x188/0x290
[    7.423505]  vfs_get_tree+0x29/0xe0
[    7.423723]  ? capable+0x37/0x70
[    7.423927]  do_new_mount+0x174/0x300
[    7.424157]  __x64_sys_mount+0x11a/0x150
[    7.424401]  do_syscall_64+0x3b/0x90
[    7.424624]  entry_SYSCALL_64_after_hwframe+0x72/0xdc
[    7.424935] RIP: 0033:0x7f3d0475562a
[    7.425160] Code: 48 8b 0d 69 18 0d 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 36 18 0d 00 f7 d8 64 89 01 48
[    7.426298] RSP: 002b:00007ffcb397aaf8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5
[    7.426761] RAX: ffffffffffffffda RBX: 00007f3d04889264 RCX: 00007f3d0475562a
[    7.427197] RDX: 0000558ea381db90 RSI: 0000558ea381dbb0 RDI: 0000558ea381dbd0
[    7.427631] RBP: 0000558ea381d960 R08: 0000558ea381dbf0 R09: 00007f3d04827be0
[    7.428063] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
[    7.428499] R13: 0000558ea381dbd0 R14: 0000558ea381db90 R15: 0000558ea381d960
[    7.428941]  </TASK>
[    7.429083] irq event stamp: 10951
[    7.429296] hardirqs last  enabled at (10959): [<ffffffff811643c2>] __up_console_sem+0x52/0x60
[    7.429824] hardirqs last disabled at (10966): [<ffffffff811643a7>] __up_console_sem+0x37/0x60
[    7.430325] softirqs last  enabled at (10574): [<ffffffff8204a529>] __do_softirq+0x2d9/0x39e
[    7.430839] softirqs last disabled at (10407): [<ffffffff810dcc57>] __irq_exit_rcu+0x87/0xb0
[    7.431354] ---[ end trace 0000000000000000 ]---
Kemeng Shi June 12, 2023, 2:24 a.m. UTC | #2
on 6/11/2023 1:05 PM, Theodore Ts'o wrote:
> On Sat, Jun 03, 2023 at 11:03:21PM +0800, Kemeng Shi wrote:
>> call ext4_mb_mark_group_bb in ext4_free_blocks_simple to:
>> 1. remove repeat code
>> 2. pair update of free_clusters in ext4_mb_new_blocks_simple.
>> 3. add missing ext4_lock_group/ext4_unlock_group protection.
>>
>> Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
>> Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
> 
> Note: after bisecting, I've found that this commit is causing a OOPS
> when running "kvm-xfstests -c ext4/adv generic/468".  It appears to be
> an issue with the fast commit feature not playing nice with this
> patch.  The stack trace looks like this:
> 
> [    7.409663] ------------[ cut here ]------------
> [    7.409969] WARNING: CPU: 0 PID: 3069 at fs/ext4/mballoc.c:3801 ext4_mb_mark_group_bb+0x48e/0x4a0
> [    7.410480] CPU: 0 PID: 3069 Comm: mount Not tainted 6.4.0-rc5-xfstests-lockdep-00021-g60ba685c5998 #146
> [    7.411067] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
> [    7.411639] RIP: 0010:ext4_mb_mark_group_bb+0x48e/0x4a0
> [    7.411968] Code: 48 c7 c7 35 b0 88 82 c6 05 16 f4 9b 01 01 e8 f9 16 c9 ff e9 7f fe ff ff 8b 45 08 c7 44 24 10 00 00 00 00 31 c9 e9 ef fc ff ff <0f> 0b e9 76 fc ff ff e8 96 64 b6 00 66 0f 1f 44 00 00 90 90 90 90
> [    7.413128] RSP: 0018:ffffc90003b0f9f8 EFLAGS: 00010246
> [    7.413458] RAX: 0000000000000003 RBX: 0000000000006002 RCX: 0000000000000001
> [    7.413902] RDX: ffff88800965b000 RSI: 0000000000000000 RDI: ffff88800d690100
> [    7.414346] RBP: ffffc90003b0fa68 R08: 000000000aebbd6e R09: 0000000000000246
> [    7.414791] R10: 00000000d148c994 R11: 00000000941da2bb R12: ffff88800d7fd000
> [    7.415234] R13: 0000000000000000 R14: ffff88800f3e4080 R15: ffff88800b5ca160
> [    7.415724] FS:  00007f3d04516840(0000) GS:ffff88807da00000(0000) knlGS:0000000000000000
> [    7.416227] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [    7.416588] CR2: 00007ffcb3979ac8 CR3: 000000000f290003 CR4: 0000000000770ef0
> [    7.417032] PKRU: 55555554
> [    7.417205] Call Trace:
> [    7.417363]  <TASK>
> [    7.417502]  ? ext4_mb_mark_group_bb+0x48e/0x4a0
> [    7.417807]  ? __warn+0x80/0x170
> [    7.418051]  ? ext4_mb_mark_group_bb+0x48e/0x4a0
> [    7.418337]  ? report_bug+0x173/0x1d0
> [    7.418567]  ? handle_bug+0x3c/0x70
> [    7.418797]  ? exc_invalid_op+0x17/0x70
> [    7.419037]  ? asm_exc_invalid_op+0x1a/0x20
> [    7.419226]  ? ext4_mb_mark_group_bb+0x48e/0x4a0
> [    7.419437]  ? ext4_mb_mark_group_bb+0xae/0x4a0
> [    7.419708]  ext4_mb_mark_bb+0xc0/0x120
> [    7.419946]  ext4_ext_clear_bb+0x210/0x280
> [    7.420198]  ext4_fc_replay_inode+0xa1/0x380
> [    7.420466]  ext4_fc_replay+0x435/0x880
> [    7.420703]  ? __getblk_gfp+0x37/0x110
> [    7.420938]  ? jread+0x7a/0x180
> [    7.421138]  do_one_pass+0x7df/0x1040
> [    7.421365]  jbd2_journal_recover+0x150/0x250
> [    7.421637]  jbd2_journal_load+0xbe/0x190
> [    7.421886]  ext4_load_journal+0x214/0x610
> [    7.422152]  ext4_load_and_init_journal+0x29/0x380
> [    7.422490]  __ext4_fill_super+0x15ca/0x15e0
> [    7.422756]  ? __pfx_ext4_fill_super+0x10/0x10
> [    7.423032]  ext4_fill_super+0xcf/0x280
> [    7.423270]  get_tree_bdev+0x188/0x290
> [    7.423505]  vfs_get_tree+0x29/0xe0
> [    7.423723]  ? capable+0x37/0x70
> [    7.423927]  do_new_mount+0x174/0x300
> [    7.424157]  __x64_sys_mount+0x11a/0x150
> [    7.424401]  do_syscall_64+0x3b/0x90
> [    7.424624]  entry_SYSCALL_64_after_hwframe+0x72/0xdc
> [    7.424935] RIP: 0033:0x7f3d0475562a
> [    7.425160] Code: 48 8b 0d 69 18 0d 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 36 18 0d 00 f7 d8 64 89 01 48
> [    7.426298] RSP: 002b:00007ffcb397aaf8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5
> [    7.426761] RAX: ffffffffffffffda RBX: 00007f3d04889264 RCX: 00007f3d0475562a
> [    7.427197] RDX: 0000558ea381db90 RSI: 0000558ea381dbb0 RDI: 0000558ea381dbd0
> [    7.427631] RBP: 0000558ea381d960 R08: 0000558ea381dbf0 R09: 00007f3d04827be0
> [    7.428063] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
> [    7.428499] R13: 0000558ea381dbd0 R14: 0000558ea381db90 R15: 0000558ea381d960
> [    7.428941]  </TASK>
> [    7.429083] irq event stamp: 10951
> [    7.429296] hardirqs last  enabled at (10959): [<ffffffff811643c2>] __up_console_sem+0x52/0x60
> [    7.429824] hardirqs last disabled at (10966): [<ffffffff811643a7>] __up_console_sem+0x37/0x60
> [    7.430325] softirqs last  enabled at (10574): [<ffffffff8204a529>] __do_softirq+0x2d9/0x39e
> [    7.430839] softirqs last disabled at (10407): [<ffffffff810dcc57>] __irq_exit_rcu+0x87/0xb0
> [    7.431354] ---[ end trace 0000000000000000 ]---
> 
> 
Hi Ted, sorry for this issue. This patch added a WARN_ON for the case where we free
blocks to an uninitialized block group, which should be invalid.
As an emergency fix, we can simply remove the WARN_ON to allow frees on an uninitialized
block group as before, and I will find out why we free blocks to an uninitialized block
group in the fast commit code path and whether that is valid behavior.
Theodore Ts'o June 12, 2023, 3:49 a.m. UTC | #3
On Mon, Jun 12, 2023 at 10:24:55AM +0800, Kemeng Shi wrote:

> Hi ted, sorry for this issue. This patch added a WARN_ON for case that we free block
> to uninitialized block group which should be invalid.
> We can simply remove the WARN_ON to allow free on uninitialized block group as old
> way for emergency fix and I will find out why we free blocks to uninitialized block
> group in fast commit code path and is it a valid behavior.

What I've done for now in the dev branch was to drop patches 12
through 19 of this patch series.  That seemed to be a good break
point, and I wanted to make sure we had something working so we can
start doing a lot more intensive testing on the patches so far.

Also, that way, when you resend the last 8 patches in the patch
series, we can make sure they get a proper review as opposed to making
changes on the fly.

The current contents of the dev branch are:

% git log --reverse --oneline origin..dev
40fa8be3852f ext4: kill unused function ext4_journalled_write_inline_data
a030569c34be ext4: Change remaining tracepoints to use folio
d1ffc6fb5ded ext4: Make mpage_journal_page_buffers use folio
5ac99c22fa84 ext4: Make ext4_write_inline_data_end() use folio
d578dfc510cf ext4: Call fsverity_verify_folio()
30f0bd64ed09 ext4: fix wrong unit use in ext4_mb_normalize_request
b9dc976cc348 ext4: fix unit mismatch in ext4_mb_new_blocks_simple
9afc5e21107a ext4: fix wrong unit use in ext4_mb_find_by_goal
860f86ccff6e ext4: treat stripe in block unit
710c384f1536 ext4: add EXT4_MB_HINT_GOAL_ONLY test in ext4_mb_use_preallocated
f242d8a98a6f ext4: remove ext4_block_group and ext4_block_group_offset declaration
5b859728b98b ext4: try all groups in ext4_mb_new_blocks_simple
ea7bbd168135 ext4: get block from bh before pass it to ext4_free_blocks_simple in ext4_free_blocks
757d9100a5d1 ext4: remove unsed parameter and unnecessary forward declaration of ext4_mb_new_blocks_simple
5d62e6da25f5 ext4: fix wrong unit use in ext4_mb_clear_bb
993d22f0a250 ext4: fix wrong unit use in ext4_mb_new_blocks
bf4f2aa4844a ext4: mballoc: Remove useless setting of ac_criteria
743f4dd07bf9 ext4: Remove unused extern variables declaration
bc40109767b3 ext4: Convert mballoc cr (criteria) to enum
52e3814a1342 ext4: Add per CR extent scanned counter
a15c09da1255 ext4: Add counter to track successful allocation of goal length
26cbe38f0275 ext4: Avoid scanning smaller extents in BG during CR1
9c8f8195852c ext4: Don't skip prefetching BLOCK_UNINIT groups
cd303d98b9b5 ext4: Ensure ext4_mb_prefetch_fini() is called for all prefetched BGs
ea639ce794e5 ext4: Abstract out logic to search average fragment list
b080c84db854 ext4: Add allocation criteria 1.5 (CR1_5)
3a08f7ac3bfa ext4: Give symbolic names to mballoc criterias
d14b5d0b1373 ext4: only update i_reserved_data_blocks on successful block allocation
b352d1f09a20 ext4: add a new helper to check if es must be kept
579c020ea7b7 ext4: factor out __es_alloc_extent() and __es_free_extent()
f4ddcde91d00 ext4: use pre-allocated es in __es_insert_extent()
e77481862663 ext4: use pre-allocated es in __es_remove_extent()
28774513875c ext4: using nofail preallocation in ext4_es_remove_extent()
e109a1db5b09 ext4: using nofail preallocation in ext4_es_insert_delayed_block()
14d876070f03 ext4: using nofail preallocation in ext4_es_insert_extent()
2af6f615b18b ext4: make ext4_es_remove_extent() return void
0ee9cccd1971 ext4: make ext4_es_insert_delayed_block() return void
7a7c285c485d ext4: make ext4_es_insert_extent() return void
9d1c6dea1aa3 ext4: make ext4_zeroout_es() return void
2e3f4cdef544 ext4: clean up mballoc criteria comments
acef67482edf ext4: allow concurrent unaligned dio overwrites
63bc068f0d1a ext4: Fix reusing stale buffer heads from last failed mounting
3a57c2f88be3 ext4: ext4_put_super: Remove redundant checking for 'sbi->s_journal_bdev'
6b960d2155f9 jbd2: remove unused feature macros
4b049709e652 jbd2: switch to check format version in superblock directly
d9eafe0afafa jbd2: factor out journal initialization from journal_get_superblock()
6eecd1f4c7ef jbd2: remove j_format_version
431ca11fafd3 jbd2: continue to record log between each mount
2ea31402649c ext4: add journal cycled recording support
a228f0e153f6 ext4: update doc about journal superblock description
f9c45d83f4da ext4: turning quotas off if mount failed after enable quotas
5404e4738054 ext4: refactoring to use the unified helper ext4_quotas_off()
d3ab1bca26b4 jbd2: recheck chechpointing non-dirty buffer
7b0cfe40a991 jbd2: remove t_checkpoint_io_list
e86f802ab8d4 jbd2: remove journal_clean_one_cp_list()
e8ece5c78f36 jbd2: Fix wrongly judgement for buffer head removing while doing checkpoint
cdffaad9649e jbd2: fix a race when checking checkpoint buffer busy
11761ed6026e jbd2: remove __journal_try_to_free_buffer()

Cheers,

					- Ted
Kemeng Shi June 13, 2023, 1:22 a.m. UTC | #4
on 6/12/2023 11:49 AM, Theodore Ts'o wrote:
> On Mon, Jun 12, 2023 at 10:24:55AM +0800, Kemeng Shi wrote:
> 
>> Hi ted, sorry for this issue. This patch added a WARN_ON for case that we free block
>> to uninitialized block group which should be invalid.
>> We can simply remove the WARN_ON to allow free on uninitialized block group as old
>> way for emergency fix and I will find out why we free blocks to uninitialized block
>> group in fast commit code path and is it a valid behavior.
> 
> What I've done for now in the dev branch was to drop patches 12
> through 19 of this patch series.  That seemed to be a good break
> point, and I wanted to make sure we had something working so we can
> start doing a lot more intesive testing on the patches so far.
> 
> Also, that way, when you resend the last 8 patches in the patch
> series, we can make sure they get a proper review as opposed to making
> changes on the fly.
Sure, I will resend the last 8 patches after I solve the issue. That way I can also
take my time to look into the problem :)
> The current contents of the dev branch are:
> 
> % git log --reverse --oneline origin..dev
> 40fa8be3852f ext4: kill unused function ext4_journalled_write_inline_data
> a030569c34be ext4: Change remaining tracepoints to use folio
> d1ffc6fb5ded ext4: Make mpage_journal_page_buffers use folio
> 5ac99c22fa84 ext4: Make ext4_write_inline_data_end() use folio
> d578dfc510cf ext4: Call fsverity_verify_folio()
> 30f0bd64ed09 ext4: fix wrong unit use in ext4_mb_normalize_request
> b9dc976cc348 ext4: fix unit mismatch in ext4_mb_new_blocks_simple
> 9afc5e21107a ext4: fix wrong unit use in ext4_mb_find_by_goal
> 860f86ccff6e ext4: treat stripe in block unit
> 710c384f1536 ext4: add EXT4_MB_HINT_GOAL_ONLY test in ext4_mb_use_preallocated
> f242d8a98a6f ext4: remove ext4_block_group and ext4_block_group_offset declaration
> 5b859728b98b ext4: try all groups in ext4_mb_new_blocks_simple
> ea7bbd168135 ext4: get block from bh before pass it to ext4_free_blocks_simple in ext4_free_blocks
> 757d9100a5d1 ext4: remove unsed parameter and unnecessary forward declaration of ext4_mb_new_blocks_simple
> 5d62e6da25f5 ext4: fix wrong unit use in ext4_mb_clear_bb
> 993d22f0a250 ext4: fix wrong unit use in ext4_mb_new_blocks
> bf4f2aa4844a ext4: mballoc: Remove useless setting of ac_criteria
> 743f4dd07bf9 ext4: Remove unused extern variables declaration
> bc40109767b3 ext4: Convert mballoc cr (criteria) to enum
> 52e3814a1342 ext4: Add per CR extent scanned counter
> a15c09da1255 ext4: Add counter to track successful allocation of goal length
> 26cbe38f0275 ext4: Avoid scanning smaller extents in BG during CR1
> 9c8f8195852c ext4: Don't skip prefetching BLOCK_UNINIT groups
> cd303d98b9b5 ext4: Ensure ext4_mb_prefetch_fini() is called for all prefetched BGs
> ea639ce794e5 ext4: Abstract out logic to search average fragment list
> b080c84db854 ext4: Add allocation criteria 1.5 (CR1_5)
> 3a08f7ac3bfa ext4: Give symbolic names to mballoc criterias
> d14b5d0b1373 ext4: only update i_reserved_data_blocks on successful block allocation
> b352d1f09a20 ext4: add a new helper to check if es must be kept
> 579c020ea7b7 ext4: factor out __es_alloc_extent() and __es_free_extent()
> f4ddcde91d00 ext4: use pre-allocated es in __es_insert_extent()
> e77481862663 ext4: use pre-allocated es in __es_remove_extent()
> 28774513875c ext4: using nofail preallocation in ext4_es_remove_extent()
> e109a1db5b09 ext4: using nofail preallocation in ext4_es_insert_delayed_block()
> 14d876070f03 ext4: using nofail preallocation in ext4_es_insert_extent()
> 2af6f615b18b ext4: make ext4_es_remove_extent() return void
> 0ee9cccd1971 ext4: make ext4_es_insert_delayed_block() return void
> 7a7c285c485d ext4: make ext4_es_insert_extent() return void
> 9d1c6dea1aa3 ext4: make ext4_zeroout_es() return void
> 2e3f4cdef544 ext4: clean up mballoc criteria comments
> acef67482edf ext4: allow concurrent unaligned dio overwrites
> 63bc068f0d1a ext4: Fix reusing stale buffer heads from last failed mounting
> 3a57c2f88be3 ext4: ext4_put_super: Remove redundant checking for 'sbi->s_journal_bdev'
> 6b960d2155f9 jbd2: remove unused feature macros
> 4b049709e652 jbd2: switch to check format version in superblock directly
> d9eafe0afafa jbd2: factor out journal initialization from journal_get_superblock()
> 6eecd1f4c7ef jbd2: remove j_format_version
> 431ca11fafd3 jbd2: continue to record log between each mount
> 2ea31402649c ext4: add journal cycled recording support
> a228f0e153f6 ext4: update doc about journal superblock description
> f9c45d83f4da ext4: turning quotas off if mount failed after enable quotas
> 5404e4738054 ext4: refactoring to use the unified helper ext4_quotas_off()
> d3ab1bca26b4 jbd2: recheck chechpointing non-dirty buffer
> 7b0cfe40a991 jbd2: remove t_checkpoint_io_list
> e86f802ab8d4 jbd2: remove journal_clean_one_cp_list()
> e8ece5c78f36 jbd2: Fix wrongly judgement for buffer head removing while doing checkpoint
> cdffaad9649e jbd2: fix a race when checking checkpoint buffer busy
> 11761ed6026e jbd2: remove __journal_try_to_free_buffer()
> 
> Cheers,
> 
> 					- Ted
>
Kemeng Shi June 20, 2023, 1:50 a.m. UTC | #5
on 6/13/2023 9:22 AM, Kemeng Shi wrote:
> 
> 
> on 6/12/2023 11:49 AM, Theodore Ts'o wrote:
>> On Mon, Jun 12, 2023 at 10:24:55AM +0800, Kemeng Shi wrote:
>>
>>> Hi ted, sorry for this issue. This patch added a WARN_ON for case that we free block
>>> to uninitialized block group which should be invalid.
>>> We can simply remove the WARN_ON to allow free on uninitialized block group as old
>>> way for emergency fix and I will find out why we free blocks to uninitialized block
>>> group in fast commit code path and is it a valid behavior.
>>
>> What I've done for now in the dev branch was to drop patches 12
>> through 19 of this patch series.  That seemed to be a good break
>> point, and I wanted to make sure we had something working so we can
>> start doing a lot more intesive testing on the patches so far.
>>
>> Also, that way, when you resend the last 8 patches in the patch
>> series, we can make sure they get a proper review as opposed to making
>> changes on the fly.
> Sure, I will resend last 8 patches after I solve the issue. I can also take my time
> to look at problem in this way :)
Here is an update on how the WARN_ON for freeing blocks to an uninitialized block group
is triggered in the fast commit path by test generic/468.

# /sbin/mkfs.ext4  -F  -q -O inline_data,fast_commit  /dev/vdc
# /bin/mount -t ext4 -o acl,user_xattr -o block_validity /dev/vdc /vdc
# /root/xfstests/bin/xfs_io -i -f -c 'truncate 4202496' -c 'pwrite 0
4202496' -c fsync -c 'falloc  4202496 104857600' /vdc/testfile

The "falloc  4202496 104857600" will trigger block allocation in a
new uninitialized block group for file range "4202496 104857600" as
follows:
ext4_map_blocks
  /*
   * Allocate blocks from an uninitialized block group. The change that
   * marks the group initialized will be fully journaled.
   */
  ext4_mb_new_blocks

  [...]

  /*
   * New extents will be tracked in fast commit.
   */
  ext4_fc_track_range

  /*
   * Add the new extents of the allocated range to the inode, which still
   * has space in ext_inode_hdr
   */
  ext4_ext_insert_extent
    [...]
    /*
     * depth is 0 because the inode has space in ext_inode_hdr; this will
     * track the inode in fast commit.
     */
    ext4_ext_dirty(handle, inode, path + path->p_depth);
      ext4_mark_inode_dirty
        ext4_mark_iloc_dirty
          ext4_fc_track_inode

# /root/xfstests/bin/xfs_io -i -c fsync /vdc/testfile
The fast commit is performed in fsync as follows:
vfs_fsync
  ext4_fsync_journal
    ext4_fc_commit
      ext4_fc_perform_commit
        add EXT4_FC_TAG_ADD_RANGE of new extent range
        add EXT4_FC_TAG_INODE of changed inode

# /root/xfstests/src/godown /vdc
The journaled change that marks the group initialized is discarded as follows:
ext4_shutdown
  jbd2_journal_abort

# /bin/umount /dev/vdc
# /bin/mount -t ext4 -o acl,user_xattr -o block_validity /dev/vdc /vdc
The fast commit is replayed at mount time and the added WARN_ON is triggered as
follows:
ext4_fc_replay
  /*
   * replay EXT4_FC_TAG_ADD_RANGE: add the extents containing blocks from
   * the uninitialized group back to the inode
   */
  ext4_fc_replay_add_range

  /*
   * replay EXT4_FC_TAG_INODE, this will trigger the WARN_ON
   */
  ext4_fc_replay_inode
    /*
     * mark all blocks in the old inode free; blocks from the uninitialized
     * block group are then freed and the WARN_ON fires
     */
    ext4_ext_clear_bb

    /* update inode with data journaled in fast commit */
    [...]

    /*
     * mark all blocks in the new inode in use, and the gdp will be marked
     * initialized normally
     */
    ext4_fc_record_modified_inode
    [...]
    ext4_fc_set_bitmaps_and_counters

In this situation, freeing blocks to an uninitialized block group does no harm.
There may be more such harmless situations, so I would like to simply
drop the WARN_ON in the next version.
diff mbox series

Patch

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 685dcc17bf7c..dae4533411f7 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3798,6 +3798,8 @@  ext4_mb_mark_group_bb(struct ext4_mark_context *mc, ext4_group_t group,
 	ext4_lock_group(sb, group);
 	if (ext4_has_group_desc_csum(sb) &&
 	    (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
+		WARN_ON(mc->state == 0);
+
 		gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
 		ext4_free_group_clusters_set(sb, gdp,
 			ext4_free_clusters_after_init(sb, group, gdp));
@@ -6120,43 +6122,16 @@  ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
 					unsigned long count)
 {
-	struct buffer_head *bitmap_bh;
+	struct ext4_mark_context mc = {
+		.sb = inode->i_sb,
+		.state = 0,
+	};
 	struct super_block *sb = inode->i_sb;
-	struct ext4_group_desc *gdp;
-	struct buffer_head *gdp_bh;
 	ext4_group_t group;
 	ext4_grpblk_t blkoff;
-	int already_freed = 0, err, i;
 
 	ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
-	bitmap_bh = ext4_read_block_bitmap(sb, group);
-	if (IS_ERR(bitmap_bh)) {
-		pr_warn("Failed to read block bitmap\n");
-		return;
-	}
-	gdp = ext4_get_group_desc(sb, group, &gdp_bh);
-	if (!gdp)
-		goto err_out;
-
-	for (i = 0; i < count; i++) {
-		if (!mb_test_bit(blkoff + i, bitmap_bh->b_data))
-			already_freed++;
-	}
-	mb_clear_bits(bitmap_bh->b_data, blkoff, count);
-	err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
-	if (err)
-		goto err_out;
-	ext4_free_group_clusters_set(
-		sb, gdp, ext4_free_group_clusters(sb, gdp) +
-		count - already_freed);
-	ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
-	ext4_group_desc_csum_set(sb, group, gdp);
-	ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
-	sync_dirty_buffer(bitmap_bh);
-	sync_dirty_buffer(gdp_bh);
-
-err_out:
-	brelse(bitmap_bh);
+	ext4_mb_mark_group_bb(&mc, group, blkoff, count);
 }
 
 /**