Message ID | 20250512063319.3539411-9-yi.zhang@huaweicloud.com |
---|---|
State | Awaiting Upstream |
Headers | show |
Series | ext4: enable large folio for regular files | expand |
Hello, kernel test robot noticed a 37.7% improvement of fsmark.files_per_sec on: commit: 34696dd792d839c46a280c720ab28aab2db1f4bf ("[PATCH v2 8/8] ext4: enable large folio for regular file") url: https://github.com/intel-lab-lkp/linux/commits/Zhang-Yi/ext4-make-ext4_mpage_readpages-support-large-folios/20250512-144942 base: https://git.kernel.org/cgit/linux/kernel/git/tytso/ext4.git dev patch link: https://lore.kernel.org/all/20250512063319.3539411-9-yi.zhang@huaweicloud.com/ patch subject: [PATCH v2 8/8] ext4: enable large folio for regular file testcase: fsmark config: x86_64-rhel-9.4 compiler: gcc-12 test machine: 96 threads 2 sockets Intel(R) Xeon(R) Platinum 8260L CPU @ 2.40GHz (Cascade Lake) with 128G memory parameters: iterations: 1x nr_threads: 1t disk: 1BRD_48G fs: ext4 filesize: 4M test_size: 24G sync_method: NoSync cpufreq_governor: performance Details are as below: --------------------------------------------------------------------------------------------------> The kernel config and materials to reproduce are available at: https://download.01.org/0day-ci/archive/20250516/202505161418.ec0d753f-lkp@intel.com ========================================================================================= compiler/cpufreq_governor/disk/filesize/fs/iterations/kconfig/nr_threads/rootfs/sync_method/tbox_group/test_size/testcase: gcc-12/performance/1BRD_48G/4M/ext4/1x/x86_64-rhel-9.4/1t/debian-12-x86_64-20240206.cgz/NoSync/lkp-csl-2sp3/24G/fsmark commit: 0368e6caf2 ("ext4: make online defragmentation support large folios") 34696dd792 ("ext4: enable large folio for regular file") 0368e6caf2d6ff21 34696dd792d839c46a280c720ab ---------------- --------------------------- %stddev %change %stddev \ | \ 1.964e+09 ± 3% -14.9% 1.671e+09 ± 8% cpuidle..time 3825 ± 20% -24.6% 2884 ± 17% sched_debug.cpu.avg_idle.min 69081 -3.2% 66894 fsmark.app_overhead 529.15 +37.7% 728.75 fsmark.files_per_sec 70.33 ± 3% -11.6% 62.17 fsmark.time.percent_of_cpu_this_job_got 3.50 ± 54% +109.5% 
7.33 ± 21% perf-sched.wait_and_delay.count.schedule_hrtimeout_range.ep_poll.do_epoll_wait.__x64_sys_epoll_wait 4.00 ± 14% +21.0% 4.84 ± 21% perf-sched.wait_and_delay.max.ms.schedule_timeout.rcu_gp_fqs_loop.rcu_gp_kthread.kthread 0.91 ± 12% -14.1% 0.78 ± 8% perf-sched.wait_time.max.ms.do_wait.kernel_wait4.do_syscall_64.entry_SYSCALL_64_after_hwframe 691923 ± 25% -62.0% 263011 ± 18% proc-vmstat.numa_foreign 12650103 ± 4% -35.5% 8153366 proc-vmstat.numa_hit 12607229 ± 3% -36.1% 8052736 proc-vmstat.numa_local 707756 ± 22% -62.8% 263011 ± 18% proc-vmstat.numa_miss 806083 ± 25% -55.0% 362335 ± 13% proc-vmstat.numa_other 231973 -3.4% 224103 proc-vmstat.pgfault 3.86 ± 2% +37.3% 5.29 ± 9% perf-stat.i.MPKI 1.889e+09 -11.1% 1.679e+09 ± 2% perf-stat.i.branch-instructions 3.60 ± 3% +0.3 3.91 ± 5% perf-stat.i.branch-miss-rate% 34677646 +17.9% 40883980 ± 8% perf-stat.i.cache-misses 2003 ± 2% +10.2% 2209 ± 4% perf-stat.i.context-switches 1.08 +14.4% 1.24 ± 4% perf-stat.i.cpi 9.762e+09 -10.6% 8.728e+09 ± 2% perf-stat.i.instructions 0.98 -9.7% 0.89 ± 4% perf-stat.i.ipc 7068 ± 3% +14.7% 8105 ± 6% perf-stat.i.minor-faults 7068 ± 3% +14.7% 8105 ± 6% perf-stat.i.page-faults 3.56 +31.5% 4.68 ± 6% perf-stat.overall.MPKI 3.67 ± 2% +0.4 4.06 ± 2% perf-stat.overall.branch-miss-rate% 1.04 +10.9% 1.15 ± 2% perf-stat.overall.cpi 291.68 -15.5% 246.52 ± 4% perf-stat.overall.cycles-between-cache-misses 0.96 -9.8% 0.87 ± 2% perf-stat.overall.ipc 1.795e+09 -12.0% 1.58e+09 perf-stat.ps.branch-instructions 65831109 -2.5% 64181687 perf-stat.ps.branch-misses 32991347 +16.6% 38460099 ± 7% perf-stat.ps.cache-misses 1905 ± 2% +9.0% 2077 ± 4% perf-stat.ps.context-switches 9.28e+09 -11.5% 8.215e+09 ± 2% perf-stat.ps.instructions 6695 ± 2% +13.6% 7604 ± 5% perf-stat.ps.minor-faults 6696 ± 2% +13.6% 7604 ± 5% perf-stat.ps.page-faults 1.918e+11 ± 2% -25.3% 1.432e+11 ± 7% perf-stat.total.instructions 24.59 ± 5% -7.7 16.86 
perf-profile.calltrace.cycles-pp.ext4_buffered_write_iter.vfs_write.ksys_write.do_syscall_64.entry_SYSCALL_64_after_hwframe 24.39 ± 5% -7.7 16.68 perf-profile.calltrace.cycles-pp.generic_perform_write.ext4_buffered_write_iter.vfs_write.ksys_write.do_syscall_64 30.60 -6.3 24.26 perf-profile.calltrace.cycles-pp.write 29.63 -6.3 23.36 perf-profile.calltrace.cycles-pp.entry_SYSCALL_64_after_hwframe.write 29.39 -6.3 23.12 perf-profile.calltrace.cycles-pp.vfs_write.ksys_write.do_syscall_64.entry_SYSCALL_64_after_hwframe.write 29.60 -6.3 23.34 perf-profile.calltrace.cycles-pp.do_syscall_64.entry_SYSCALL_64_after_hwframe.write 29.50 -6.3 23.24 perf-profile.calltrace.cycles-pp.ksys_write.do_syscall_64.entry_SYSCALL_64_after_hwframe.write 6.31 ± 3% -5.1 1.24 ± 6% perf-profile.calltrace.cycles-pp.ext4_da_do_write_end.generic_perform_write.ext4_buffered_write_iter.vfs_write.ksys_write 5.84 ± 3% -4.7 1.17 ± 6% perf-profile.calltrace.cycles-pp.block_write_end.ext4_da_do_write_end.generic_perform_write.ext4_buffered_write_iter.vfs_write 5.79 ± 3% -4.6 1.16 ± 6% perf-profile.calltrace.cycles-pp.__block_commit_write.block_write_end.ext4_da_do_write_end.generic_perform_write.ext4_buffered_write_iter 9.66 ± 5% -4.0 5.62 ± 2% perf-profile.calltrace.cycles-pp.ext4_da_write_begin.generic_perform_write.ext4_buffered_write_iter.vfs_write.ksys_write 17.24 ± 5% -3.7 13.49 perf-profile.calltrace.cycles-pp.__writeback_single_inode.writeback_sb_inodes.__writeback_inodes_wb.wb_writeback.wb_do_writeback 17.24 ± 5% -3.7 13.49 perf-profile.calltrace.cycles-pp.do_writepages.__writeback_single_inode.writeback_sb_inodes.__writeback_inodes_wb.wb_writeback 17.24 ± 5% -3.7 13.50 perf-profile.calltrace.cycles-pp.__writeback_inodes_wb.wb_writeback.wb_do_writeback.wb_workfn.process_one_work 17.24 ± 5% -3.7 13.50 perf-profile.calltrace.cycles-pp.writeback_sb_inodes.__writeback_inodes_wb.wb_writeback.wb_do_writeback.wb_workfn 17.24 ± 5% -3.7 13.49 
perf-profile.calltrace.cycles-pp.ext4_writepages.do_writepages.__writeback_single_inode.writeback_sb_inodes.__writeback_inodes_wb 4.92 ± 6% -3.1 1.87 ± 2% perf-profile.calltrace.cycles-pp.__filemap_get_folio.ext4_da_write_begin.generic_perform_write.ext4_buffered_write_iter.vfs_write 2.60 ± 5% -1.7 0.94 ± 4% perf-profile.calltrace.cycles-pp.filemap_add_folio.__filemap_get_folio.ext4_da_write_begin.generic_perform_write.ext4_buffered_write_iter 1.78 ± 8% -1.1 0.68 ± 7% perf-profile.calltrace.cycles-pp.alloc_pages_mpol.folio_alloc_noprof.__filemap_get_folio.ext4_da_write_begin.generic_perform_write 1.56 ± 8% -1.1 0.48 ± 44% perf-profile.calltrace.cycles-pp.mpage_submit_folio.mpage_map_and_submit_buffers.mpage_map_and_submit_extent.ext4_do_writepages.ext4_writepages 1.86 ± 7% -1.0 0.82 ± 6% perf-profile.calltrace.cycles-pp.folio_alloc_noprof.__filemap_get_folio.ext4_da_write_begin.generic_perform_write.ext4_buffered_write_iter 1.65 ± 7% -1.0 0.64 ± 7% perf-profile.calltrace.cycles-pp.__alloc_frozen_pages_noprof.alloc_pages_mpol.folio_alloc_noprof.__filemap_get_folio.ext4_da_write_begin 4.54 ± 5% -0.9 3.67 ± 3% perf-profile.calltrace.cycles-pp.ext4_block_write_begin.ext4_da_write_begin.generic_perform_write.ext4_buffered_write_iter.vfs_write 1.93 ± 6% -0.8 1.08 ± 32% perf-profile.calltrace.cycles-pp.ext4_finish_bio.ext4_release_io_end.ext4_end_io_end.ext4_do_writepages.ext4_writepages 1.93 ± 6% -0.8 1.09 ± 31% perf-profile.calltrace.cycles-pp.ext4_release_io_end.ext4_end_io_end.ext4_do_writepages.ext4_writepages.do_writepages 1.42 ± 7% -0.8 0.58 ± 9% perf-profile.calltrace.cycles-pp.get_page_from_freelist.__alloc_frozen_pages_noprof.alloc_pages_mpol.folio_alloc_noprof.__filemap_get_folio 1.96 ± 6% -0.8 1.20 ± 26% perf-profile.calltrace.cycles-pp.ext4_end_io_end.ext4_do_writepages.ext4_writepages.do_writepages.__writeback_single_inode 1.31 ± 3% -0.7 0.58 ± 6% 
perf-profile.calltrace.cycles-pp.mark_buffer_dirty.__block_commit_write.block_write_end.ext4_da_do_write_end.generic_perform_write 2.04 ± 9% -0.7 1.35 ± 7% perf-profile.calltrace.cycles-pp.mpage_map_and_submit_buffers.mpage_map_and_submit_extent.ext4_do_writepages.ext4_writepages.do_writepages 2.11 ± 9% -0.6 1.46 ± 7% perf-profile.calltrace.cycles-pp.mpage_map_and_submit_extent.ext4_do_writepages.ext4_writepages.do_writepages.__writeback_single_inode 1.84 ± 7% -0.5 1.30 ± 7% perf-profile.calltrace.cycles-pp.create_empty_buffers.ext4_block_write_begin.ext4_da_write_begin.generic_perform_write.ext4_buffered_write_iter 1.38 ± 8% -0.2 1.18 ± 7% perf-profile.calltrace.cycles-pp.folio_alloc_buffers.create_empty_buffers.ext4_block_write_begin.ext4_da_write_begin.generic_perform_write 0.62 ± 2% -0.1 0.56 ± 5% perf-profile.calltrace.cycles-pp.ext4_es_lookup_extent.ext4_da_map_blocks.ext4_da_get_block_prep.ext4_block_write_begin.ext4_da_write_begin 0.40 ± 70% +0.3 0.67 ± 5% perf-profile.calltrace.cycles-pp.io_serial_out.serial8250_console_write.console_flush_all.console_unlock.vprintk_emit 1.34 ± 9% +0.6 1.96 ± 4% perf-profile.calltrace.cycles-pp.clear_page_erms.get_page_from_freelist.__alloc_frozen_pages_noprof.alloc_pages_mpol.alloc_pages_noprof 0.34 ±103% +0.8 1.13 ± 25% perf-profile.calltrace.cycles-pp.get_jiffies_update.tmigr_requires_handle_remote.update_process_times.tick_nohz_handler.__hrtimer_run_queues 1.84 ± 12% +0.8 2.62 ± 10% perf-profile.calltrace.cycles-pp.update_process_times.tick_nohz_handler.__hrtimer_run_queues.hrtimer_interrupt.__sysvec_apic_timer_interrupt 0.35 ±103% +0.8 1.15 ± 24% perf-profile.calltrace.cycles-pp.tmigr_requires_handle_remote.update_process_times.tick_nohz_handler.__hrtimer_run_queues.hrtimer_interrupt 4.14 ± 8% +0.9 5.02 ± 9% perf-profile.calltrace.cycles-pp.__sysvec_apic_timer_interrupt.sysvec_apic_timer_interrupt.asm_sysvec_apic_timer_interrupt.cpuidle_enter_state.cpuidle_enter 4.11 ± 8% +0.9 5.00 ± 9% 
perf-profile.calltrace.cycles-pp.hrtimer_interrupt.__sysvec_apic_timer_interrupt.sysvec_apic_timer_interrupt.asm_sysvec_apic_timer_interrupt.cpuidle_enter_state 1.95 ± 6% +0.9 2.89 ± 3% perf-profile.calltrace.cycles-pp.get_page_from_freelist.__alloc_frozen_pages_noprof.alloc_pages_mpol.alloc_pages_noprof.brd_insert_page 2.35 ± 10% +1.0 3.32 ± 12% perf-profile.calltrace.cycles-pp.tick_nohz_handler.__hrtimer_run_queues.hrtimer_interrupt.__sysvec_apic_timer_interrupt.sysvec_apic_timer_interrupt 3.21 ± 9% +1.0 4.21 ± 10% perf-profile.calltrace.cycles-pp.__hrtimer_run_queues.hrtimer_interrupt.__sysvec_apic_timer_interrupt.sysvec_apic_timer_interrupt.asm_sysvec_apic_timer_interrupt 2.10 ± 6% +1.0 3.12 ± 2% perf-profile.calltrace.cycles-pp.__alloc_frozen_pages_noprof.alloc_pages_mpol.alloc_pages_noprof.brd_insert_page.brd_submit_bio 2.21 ± 5% +1.1 3.30 ± 3% perf-profile.calltrace.cycles-pp.alloc_pages_mpol.alloc_pages_noprof.brd_insert_page.brd_submit_bio.__submit_bio 2.26 ± 5% +1.1 3.37 ± 3% perf-profile.calltrace.cycles-pp.alloc_pages_noprof.brd_insert_page.brd_submit_bio.__submit_bio.__submit_bio_noacct 3.70 ± 30% +1.2 4.90 ± 4% perf-profile.calltrace.cycles-pp.wait_for_lsr.serial8250_console_write.console_flush_all.console_unlock.vprintk_emit 4.40 ± 29% +1.4 5.82 ± 4% perf-profile.calltrace.cycles-pp.serial8250_console_write.console_flush_all.console_unlock.vprintk_emit.devkmsg_emit 4.96 ± 32% +1.4 6.41 perf-profile.calltrace.cycles-pp.memcpy_toio.drm_fb_memcpy.ast_primary_plane_helper_atomic_update.drm_atomic_helper_commit_planes.drm_atomic_helper_commit_tail 4.68 ± 30% +1.5 6.17 ± 4% perf-profile.calltrace.cycles-pp.devkmsg_write.vfs_write.ksys_write.do_syscall_64.entry_SYSCALL_64_after_hwframe 4.68 ± 30% +1.5 6.17 ± 4% perf-profile.calltrace.cycles-pp.devkmsg_emit.devkmsg_write.vfs_write.ksys_write.do_syscall_64 4.68 ± 30% +1.5 6.17 ± 4% perf-profile.calltrace.cycles-pp.vprintk_emit.devkmsg_emit.devkmsg_write.vfs_write.ksys_write 4.68 ± 30% +1.5 6.17 ± 4% 
perf-profile.calltrace.cycles-pp.console_unlock.vprintk_emit.devkmsg_emit.devkmsg_write.vfs_write 4.68 ± 30% +1.5 6.17 ± 4% perf-profile.calltrace.cycles-pp.console_flush_all.console_unlock.vprintk_emit.devkmsg_emit.devkmsg_write 5.04 ± 32% +1.5 6.54 perf-profile.calltrace.cycles-pp.drm_fb_memcpy.ast_primary_plane_helper_atomic_update.drm_atomic_helper_commit_planes.drm_atomic_helper_commit_tail.ast_mode_config_helper_atomic_commit_tail 5.06 ± 32% +1.5 6.59 perf-profile.calltrace.cycles-pp.ast_primary_plane_helper_atomic_update.drm_atomic_helper_commit_planes.drm_atomic_helper_commit_tail.ast_mode_config_helper_atomic_commit_tail.commit_tail 5.07 ± 32% +1.5 6.60 perf-profile.calltrace.cycles-pp.drm_atomic_commit.drm_atomic_helper_dirtyfb.drm_fbdev_shmem_helper_fb_dirty.drm_fb_helper_damage_work.process_one_work 5.07 ± 32% +1.5 6.59 perf-profile.calltrace.cycles-pp.drm_atomic_helper_commit_planes.drm_atomic_helper_commit_tail.ast_mode_config_helper_atomic_commit_tail.commit_tail.drm_atomic_helper_commit 5.07 ± 32% +1.5 6.60 perf-profile.calltrace.cycles-pp.drm_atomic_helper_commit_tail.ast_mode_config_helper_atomic_commit_tail.commit_tail.drm_atomic_helper_commit.drm_atomic_commit 5.07 ± 32% +1.5 6.60 perf-profile.calltrace.cycles-pp.ast_mode_config_helper_atomic_commit_tail.commit_tail.drm_atomic_helper_commit.drm_atomic_commit.drm_atomic_helper_dirtyfb 5.07 ± 32% +1.5 6.60 perf-profile.calltrace.cycles-pp.commit_tail.drm_atomic_helper_commit.drm_atomic_commit.drm_atomic_helper_dirtyfb.drm_fbdev_shmem_helper_fb_dirty 5.07 ± 32% +1.5 6.60 perf-profile.calltrace.cycles-pp.drm_atomic_helper_commit.drm_atomic_commit.drm_atomic_helper_dirtyfb.drm_fbdev_shmem_helper_fb_dirty.drm_fb_helper_damage_work 5.07 ± 32% +1.5 6.60 perf-profile.calltrace.cycles-pp.drm_atomic_helper_dirtyfb.drm_fbdev_shmem_helper_fb_dirty.drm_fb_helper_damage_work.process_one_work.worker_thread 5.07 ± 32% +1.5 6.60 
perf-profile.calltrace.cycles-pp.drm_fb_helper_damage_work.process_one_work.worker_thread.kthread.ret_from_fork 5.07 ± 32% +1.5 6.60 perf-profile.calltrace.cycles-pp.drm_fbdev_shmem_helper_fb_dirty.drm_fb_helper_damage_work.process_one_work.worker_thread.kthread 7.23 ± 9% +1.9 9.10 ± 2% perf-profile.calltrace.cycles-pp.rep_movs_alternative.copy_page_from_iter_atomic.generic_perform_write.ext4_buffered_write_iter.vfs_write 7.48 ± 9% +1.9 9.39 ± 2% perf-profile.calltrace.cycles-pp.copy_page_from_iter_atomic.generic_perform_write.ext4_buffered_write_iter.vfs_write.ksys_write 3.78 ± 7% +2.2 5.96 ± 7% perf-profile.calltrace.cycles-pp.memcpy_orig.copy_to_brd.brd_submit_bio.__submit_bio.__submit_bio_noacct 4.02 ± 7% +2.3 6.28 ± 6% perf-profile.calltrace.cycles-pp.copy_to_brd.brd_submit_bio.__submit_bio.__submit_bio_noacct.ext4_io_submit 5.63 ± 8% +3.2 8.81 ± 4% perf-profile.calltrace.cycles-pp._raw_spin_lock.brd_insert_page.brd_submit_bio.__submit_bio.__submit_bio_noacct 8.48 ± 7% +4.4 12.91 ± 3% perf-profile.calltrace.cycles-pp.brd_insert_page.brd_submit_bio.__submit_bio.__submit_bio_noacct.ext4_io_submit 17.80 ± 4% +5.1 22.94 perf-profile.calltrace.cycles-pp.wb_do_writeback.wb_workfn.process_one_work.worker_thread.kthread 17.80 ± 4% +5.1 22.94 perf-profile.calltrace.cycles-pp.wb_workfn.process_one_work.worker_thread.kthread.ret_from_fork 17.80 ± 4% +5.1 22.94 perf-profile.calltrace.cycles-pp.wb_writeback.wb_do_writeback.wb_workfn.process_one_work.worker_thread 17.72 ± 4% +5.2 22.93 perf-profile.calltrace.cycles-pp.ext4_do_writepages.ext4_writepages.do_writepages.__writeback_single_inode.writeback_sb_inodes 12.77 ± 5% +6.6 19.36 perf-profile.calltrace.cycles-pp.brd_submit_bio.__submit_bio.__submit_bio_noacct.ext4_io_submit.ext4_do_writepages 12.77 ± 5% +6.6 19.37 perf-profile.calltrace.cycles-pp.__submit_bio.__submit_bio_noacct.ext4_io_submit.ext4_do_writepages.ext4_writepages 12.77 ± 5% +6.6 19.37 
perf-profile.calltrace.cycles-pp.__submit_bio_noacct.ext4_io_submit.ext4_do_writepages.ext4_writepages.do_writepages 12.77 ± 5% +6.6 19.37 perf-profile.calltrace.cycles-pp.ext4_io_submit.ext4_do_writepages.ext4_writepages.do_writepages.__writeback_single_inode 23.05 ± 4% +6.7 29.71 perf-profile.calltrace.cycles-pp.kthread.ret_from_fork.ret_from_fork_asm 22.96 ± 4% +6.7 29.61 perf-profile.calltrace.cycles-pp.process_one_work.worker_thread.kthread.ret_from_fork.ret_from_fork_asm 23.05 ± 4% +6.7 29.71 perf-profile.calltrace.cycles-pp.ret_from_fork.ret_from_fork_asm 23.05 ± 4% +6.7 29.71 perf-profile.calltrace.cycles-pp.ret_from_fork_asm 22.98 ± 4% +6.7 29.64 perf-profile.calltrace.cycles-pp.worker_thread.kthread.ret_from_fork.ret_from_fork_asm 0.48 ±110% +9.0 9.44 ± 2% perf-profile.calltrace.cycles-pp.ext4_writepages.do_writepages.__writeback_single_inode.writeback_sb_inodes.wb_writeback 0.48 ±110% +9.0 9.44 ± 2% perf-profile.calltrace.cycles-pp.__writeback_single_inode.writeback_sb_inodes.wb_writeback.wb_do_writeback.wb_workfn 0.48 ±110% +9.0 9.44 ± 2% perf-profile.calltrace.cycles-pp.do_writepages.__writeback_single_inode.writeback_sb_inodes.wb_writeback.wb_do_writeback 0.48 ±110% +9.0 9.44 ± 2% perf-profile.calltrace.cycles-pp.writeback_sb_inodes.wb_writeback.wb_do_writeback.wb_workfn.process_one_work 24.59 ± 5% -7.7 16.86 perf-profile.children.cycles-pp.ext4_buffered_write_iter 24.41 ± 5% -7.7 16.69 perf-profile.children.cycles-pp.generic_perform_write 31.13 -6.6 24.56 ± 2% perf-profile.children.cycles-pp.entry_SYSCALL_64_after_hwframe 31.10 -6.6 24.53 ± 2% perf-profile.children.cycles-pp.do_syscall_64 30.67 -6.4 24.32 perf-profile.children.cycles-pp.write 29.40 -6.3 23.14 perf-profile.children.cycles-pp.vfs_write 29.51 -6.3 23.26 perf-profile.children.cycles-pp.ksys_write 6.32 ± 3% -5.1 1.24 ± 6% perf-profile.children.cycles-pp.ext4_da_do_write_end 5.86 ± 3% -4.7 1.17 ± 6% perf-profile.children.cycles-pp.block_write_end 5.80 ± 3% -4.6 1.16 ± 6% 
perf-profile.children.cycles-pp.__block_commit_write 9.67 ± 5% -4.0 5.62 ± 2% perf-profile.children.cycles-pp.ext4_da_write_begin 17.24 ± 5% -3.7 13.50 perf-profile.children.cycles-pp.__writeback_inodes_wb 4.94 ± 6% -3.1 1.88 ± 2% perf-profile.children.cycles-pp.__filemap_get_folio 2.62 ± 5% -1.7 0.95 ± 4% perf-profile.children.cycles-pp.filemap_add_folio 1.86 ± 7% -1.0 0.82 ± 6% perf-profile.children.cycles-pp.folio_alloc_noprof 4.54 ± 5% -0.9 3.68 ± 3% perf-profile.children.cycles-pp.ext4_block_write_begin 2.11 ± 8% -0.8 1.35 ± 7% perf-profile.children.cycles-pp.mpage_map_and_submit_buffers 1.15 ± 6% -0.7 0.41 ± 11% perf-profile.children.cycles-pp.__folio_batch_add_and_move 1.31 ± 3% -0.7 0.58 ± 6% perf-profile.children.cycles-pp.mark_buffer_dirty 1.34 ± 7% -0.7 0.61 ± 9% perf-profile.children.cycles-pp.folio_end_writeback 2.17 ± 8% -0.7 1.46 ± 7% perf-profile.children.cycles-pp.mpage_map_and_submit_extent 1.02 ± 3% -0.7 0.33 ± 10% perf-profile.children.cycles-pp.__folio_mark_dirty 2.00 ± 5% -0.7 1.32 ± 10% perf-profile.children.cycles-pp.ext4_finish_bio 2.00 ± 5% -0.7 1.32 ± 10% perf-profile.children.cycles-pp.ext4_release_io_end 2.03 ± 6% -0.7 1.36 ± 9% perf-profile.children.cycles-pp.ext4_end_io_end 1.61 ± 7% -0.7 0.94 ± 8% perf-profile.children.cycles-pp.mpage_submit_folio 1.04 ± 5% -0.6 0.39 ± 11% perf-profile.children.cycles-pp.folio_batch_move_lru 1.07 ± 8% -0.6 0.44 ± 6% perf-profile.children.cycles-pp.__filemap_add_folio 0.80 ± 5% -0.6 0.21 ± 7% perf-profile.children.cycles-pp.lru_add 1.11 ± 6% -0.6 0.52 ± 9% perf-profile.children.cycles-pp.__folio_end_writeback 1.84 ± 7% -0.5 1.30 ± 7% perf-profile.children.cycles-pp.create_empty_buffers 0.99 ± 5% -0.5 0.45 ± 5% perf-profile.children.cycles-pp.__lruvec_stat_mod_folio 0.68 ± 8% -0.4 0.27 ± 11% perf-profile.children.cycles-pp.__folio_start_writeback 0.66 ± 7% -0.4 0.27 ± 8% perf-profile.children.cycles-pp.__mod_memcg_lruvec_state 0.53 ± 7% -0.4 0.14 ± 11% perf-profile.children.cycles-pp.lru_gen_add_folio 
1.50 ± 7% -0.4 1.14 ± 7% perf-profile.children.cycles-pp.rmqueue 1.09 ± 7% -0.3 0.75 ± 9% perf-profile.children.cycles-pp.ext4_bio_write_folio 0.56 ± 5% -0.3 0.23 ± 11% perf-profile.children.cycles-pp.folio_account_dirtied 0.48 ± 7% -0.3 0.18 ± 7% perf-profile.children.cycles-pp.folio_clear_dirty_for_io 1.12 ± 6% -0.3 0.82 ± 9% perf-profile.children.cycles-pp.__rmqueue_pcplist 0.36 ± 10% -0.3 0.09 ± 4% perf-profile.children.cycles-pp.fault_in_iov_iter_readable 0.33 ± 10% -0.2 0.08 ± 8% perf-profile.children.cycles-pp.fault_in_readable 0.33 ± 6% -0.2 0.09 ± 14% perf-profile.children.cycles-pp.__mem_cgroup_charge 0.28 ± 8% -0.2 0.07 ± 17% perf-profile.children.cycles-pp.filemap_get_entry 0.27 ± 5% -0.2 0.06 ± 19% perf-profile.children.cycles-pp.__xa_set_mark 1.40 ± 8% -0.2 1.19 ± 7% perf-profile.children.cycles-pp.folio_alloc_buffers 0.28 ± 9% -0.2 0.08 ± 8% perf-profile.children.cycles-pp.node_dirty_ok 0.47 ± 10% -0.2 0.28 ± 13% perf-profile.children.cycles-pp.percpu_counter_add_batch 1.00 ± 8% -0.2 0.83 ± 3% perf-profile.children.cycles-pp.xas_load 0.21 ± 11% -0.1 0.08 perf-profile.children.cycles-pp.__mod_node_page_state 0.76 ± 8% -0.1 0.64 ± 8% perf-profile.children.cycles-pp.rmqueue_bulk 0.20 ± 9% -0.1 0.09 ± 18% perf-profile.children.cycles-pp._raw_spin_lock_irq 0.16 ± 11% -0.1 0.06 ± 48% perf-profile.children.cycles-pp.__mark_inode_dirty 0.12 ± 16% -0.1 0.02 ± 99% perf-profile.children.cycles-pp.xas_find_conflict 0.25 ± 11% -0.1 0.16 ± 10% perf-profile.children.cycles-pp.balance_dirty_pages_ratelimited_flags 0.12 ± 17% -0.1 0.02 ± 99% perf-profile.children.cycles-pp.mod_zone_page_state 0.14 ± 9% -0.1 0.06 ± 15% perf-profile.children.cycles-pp.charge_memcg 0.15 ± 9% -0.1 0.06 ± 11% perf-profile.children.cycles-pp.cgroup_rstat_updated 0.16 ± 12% -0.1 0.08 ± 12% perf-profile.children.cycles-pp.ext4_da_write_end 0.22 ± 13% -0.1 0.14 ± 10% perf-profile.children.cycles-pp.xas_start 0.35 ± 10% -0.1 0.28 ± 8% perf-profile.children.cycles-pp.allocate_slab 0.12 ± 10% 
-0.1 0.05 ± 47% perf-profile.children.cycles-pp.try_charge_memcg 0.12 ± 11% -0.1 0.05 ± 7% perf-profile.children.cycles-pp.__mod_zone_page_state 0.12 ± 18% -0.1 0.06 ± 18% perf-profile.children.cycles-pp.__fprop_add_percpu 0.57 ± 9% -0.1 0.51 ± 6% perf-profile.children.cycles-pp.__memcg_slab_post_alloc_hook 0.63 ± 2% -0.1 0.58 ± 4% perf-profile.children.cycles-pp.ext4_es_lookup_extent 0.12 ± 15% -0.1 0.07 ± 16% perf-profile.children.cycles-pp._raw_spin_unlock_irqrestore 0.08 ± 20% -0.1 0.03 ±100% perf-profile.children.cycles-pp.xas_find_marked 0.20 ± 12% -0.0 0.16 ± 9% perf-profile.children.cycles-pp.__cond_resched 0.12 ± 10% -0.0 0.08 ± 11% perf-profile.children.cycles-pp.policy_nodemask 0.14 ± 5% -0.0 0.11 ± 8% perf-profile.children.cycles-pp.up_write 0.08 ± 8% -0.0 0.06 ± 15% perf-profile.children.cycles-pp.rcu_all_qs 0.10 ± 13% +0.0 0.12 ± 6% perf-profile.children.cycles-pp.vfs_read 0.10 ± 15% +0.0 0.13 ± 8% perf-profile.children.cycles-pp.ksys_read 0.07 ± 15% +0.0 0.11 ± 24% perf-profile.children.cycles-pp.ext4_ext_map_blocks 0.07 ± 14% +0.1 0.12 ± 18% perf-profile.children.cycles-pp.ext4_map_create_blocks 0.08 ± 8% +0.1 0.14 ± 19% perf-profile.children.cycles-pp.ext4_map_blocks 0.01 ±223% +0.1 0.07 ± 33% perf-profile.children.cycles-pp.ext4_mb_new_blocks 0.30 ± 10% +0.1 0.43 ± 8% perf-profile.children.cycles-pp.__xa_insert 0.37 ± 12% +0.1 0.51 ± 9% perf-profile.children.cycles-pp.xa_load 0.52 ± 29% +0.2 0.68 ± 5% perf-profile.children.cycles-pp.io_serial_out 1.40 ± 10% +0.6 1.98 ± 4% perf-profile.children.cycles-pp.clear_page_erms 0.56 ± 31% +0.6 1.17 ± 24% perf-profile.children.cycles-pp.tmigr_requires_handle_remote 0.54 ± 34% +0.6 1.15 ± 24% perf-profile.children.cycles-pp.get_jiffies_update 1.98 ± 12% +0.8 2.80 ± 10% perf-profile.children.cycles-pp.update_process_times 4.39 ± 8% +0.9 5.31 ± 9% perf-profile.children.cycles-pp.__sysvec_apic_timer_interrupt 4.36 ± 8% +0.9 5.29 ± 9% perf-profile.children.cycles-pp.hrtimer_interrupt 2.53 ± 10% +1.0 3.53 ± 12% 
perf-profile.children.cycles-pp.tick_nohz_handler 3.44 ± 8% +1.0 4.47 ± 10% perf-profile.children.cycles-pp.__hrtimer_run_queues 2.35 ± 5% +1.0 3.39 ± 3% perf-profile.children.cycles-pp.alloc_pages_noprof 4.03 ± 29% +1.3 5.28 ± 5% perf-profile.children.cycles-pp.wait_for_lsr 4.55 ± 29% +1.4 5.97 ± 5% perf-profile.children.cycles-pp.serial8250_console_write 4.84 ± 29% +1.5 6.32 ± 4% perf-profile.children.cycles-pp.console_flush_all 4.84 ± 29% +1.5 6.32 ± 4% perf-profile.children.cycles-pp.console_unlock 4.85 ± 29% +1.5 6.32 ± 4% perf-profile.children.cycles-pp.vprintk_emit 4.68 ± 30% +1.5 6.17 ± 4% perf-profile.children.cycles-pp.devkmsg_write 4.68 ± 30% +1.5 6.17 ± 4% perf-profile.children.cycles-pp.devkmsg_emit 5.02 ± 32% +1.5 6.52 perf-profile.children.cycles-pp.memcpy_toio 5.06 ± 32% +1.5 6.59 perf-profile.children.cycles-pp.drm_fb_memcpy 5.06 ± 32% +1.5 6.59 perf-profile.children.cycles-pp.ast_primary_plane_helper_atomic_update 5.07 ± 32% +1.5 6.60 perf-profile.children.cycles-pp.drm_atomic_commit 5.07 ± 32% +1.5 6.59 perf-profile.children.cycles-pp.drm_atomic_helper_commit_planes 5.07 ± 32% +1.5 6.60 perf-profile.children.cycles-pp.drm_atomic_helper_commit_tail 5.07 ± 32% +1.5 6.60 perf-profile.children.cycles-pp.ast_mode_config_helper_atomic_commit_tail 5.07 ± 32% +1.5 6.60 perf-profile.children.cycles-pp.commit_tail 5.07 ± 32% +1.5 6.60 perf-profile.children.cycles-pp.drm_atomic_helper_commit 5.07 ± 32% +1.5 6.60 perf-profile.children.cycles-pp.drm_atomic_helper_dirtyfb 5.07 ± 32% +1.5 6.60 perf-profile.children.cycles-pp.drm_fb_helper_damage_work 5.07 ± 32% +1.5 6.60 perf-profile.children.cycles-pp.drm_fbdev_shmem_helper_fb_dirty 7.25 ± 9% +1.9 9.13 ± 2% perf-profile.children.cycles-pp.rep_movs_alternative 7.50 ± 9% +1.9 9.40 ± 2% perf-profile.children.cycles-pp.copy_page_from_iter_atomic 3.96 ± 7% +2.0 6.00 ± 7% perf-profile.children.cycles-pp.memcpy_orig 4.18 ± 7% +2.1 6.30 ± 6% perf-profile.children.cycles-pp.copy_to_brd 6.53 ± 7% +2.8 9.36 ± 3% 
perf-profile.children.cycles-pp._raw_spin_lock 8.66 ± 6% +4.3 12.93 ± 3% perf-profile.children.cycles-pp.brd_insert_page 17.79 ± 4% +5.1 22.93 perf-profile.children.cycles-pp.__writeback_single_inode 17.79 ± 4% +5.1 22.93 perf-profile.children.cycles-pp.ext4_writepages 17.79 ± 4% +5.1 22.93 perf-profile.children.cycles-pp.do_writepages 17.79 ± 4% +5.1 22.93 perf-profile.children.cycles-pp.ext4_do_writepages 17.80 ± 4% +5.1 22.94 perf-profile.children.cycles-pp.writeback_sb_inodes 17.80 ± 4% +5.1 22.94 perf-profile.children.cycles-pp.wb_do_writeback 17.80 ± 4% +5.1 22.94 perf-profile.children.cycles-pp.wb_workfn 17.80 ± 4% +5.1 22.94 perf-profile.children.cycles-pp.wb_writeback 12.92 ± 4% +6.4 19.36 perf-profile.children.cycles-pp.brd_submit_bio 12.92 ± 4% +6.4 19.37 perf-profile.children.cycles-pp.__submit_bio 12.92 ± 4% +6.4 19.37 perf-profile.children.cycles-pp.__submit_bio_noacct 12.89 ± 4% +6.5 19.37 perf-profile.children.cycles-pp.ext4_io_submit 23.05 ± 4% +6.7 29.71 perf-profile.children.cycles-pp.kthread 22.96 ± 4% +6.7 29.61 perf-profile.children.cycles-pp.process_one_work 23.06 ± 4% +6.7 29.71 perf-profile.children.cycles-pp.ret_from_fork_asm 23.05 ± 4% +6.7 29.71 perf-profile.children.cycles-pp.ret_from_fork 22.98 ± 4% +6.7 29.64 perf-profile.children.cycles-pp.worker_thread 4.33 ± 4% -3.8 0.53 ± 8% perf-profile.self.cycles-pp.__block_commit_write 0.96 ± 6% -0.4 0.57 ± 8% perf-profile.self.cycles-pp._raw_spin_lock_irqsave 0.52 ± 8% -0.3 0.21 ± 10% perf-profile.self.cycles-pp.__mod_memcg_lruvec_state 0.37 ± 6% -0.3 0.10 ± 8% perf-profile.self.cycles-pp.lru_gen_add_folio 0.34 ± 6% -0.3 0.07 ± 5% perf-profile.self.cycles-pp.__filemap_add_folio 0.32 ± 8% -0.3 0.06 ± 45% perf-profile.self.cycles-pp.ext4_da_do_write_end 0.33 ± 9% -0.2 0.08 ± 10% perf-profile.self.cycles-pp.fault_in_readable 0.33 ± 11% -0.2 0.12 ± 10% perf-profile.self.cycles-pp.__folio_end_writeback 0.26 ± 8% -0.2 0.07 ± 20% perf-profile.self.cycles-pp.lru_add 0.36 ± 8% -0.2 0.18 ± 16% 
perf-profile.self.cycles-pp.__rmqueue_pcplist 0.37 ± 10% -0.2 0.19 ± 18% perf-profile.self.cycles-pp.percpu_counter_add_batch 0.24 ± 7% -0.2 0.07 ± 14% perf-profile.self.cycles-pp.create_empty_buffers 0.69 ± 7% -0.2 0.53 ± 6% perf-profile.self.cycles-pp.rmqueue_bulk 0.24 ± 9% -0.2 0.08 ± 8% perf-profile.self.cycles-pp.__folio_start_writeback 0.27 ± 7% -0.1 0.12 ± 8% perf-profile.self.cycles-pp.ext4_block_write_begin 0.22 ± 9% -0.1 0.08 ± 12% perf-profile.self.cycles-pp.folio_clear_dirty_for_io 0.26 ± 9% -0.1 0.12 ± 13% perf-profile.self.cycles-pp.folios_put_refs 0.22 ± 12% -0.1 0.08 ± 8% perf-profile.self.cycles-pp.folio_end_writeback 0.29 ± 7% -0.1 0.15 ± 5% perf-profile.self.cycles-pp.__lruvec_stat_mod_folio 0.19 ± 11% -0.1 0.06 ± 9% perf-profile.self.cycles-pp.node_dirty_ok 0.16 ± 4% -0.1 0.05 ± 49% perf-profile.self.cycles-pp.ext4_da_write_begin 0.18 ± 11% -0.1 0.07 ± 5% perf-profile.self.cycles-pp.__mod_node_page_state 0.20 ± 7% -0.1 0.08 ± 17% perf-profile.self.cycles-pp._raw_spin_lock_irq 0.16 ± 13% -0.1 0.07 ± 12% perf-profile.self.cycles-pp.ext4_da_write_end 0.32 ± 12% -0.1 0.23 ± 15% perf-profile.self.cycles-pp.get_page_from_freelist 0.11 ± 8% -0.1 0.02 ± 99% perf-profile.self.cycles-pp.__mod_zone_page_state 0.14 ± 8% -0.1 0.06 ± 19% perf-profile.self.cycles-pp.mpage_prepare_extent_to_map 0.14 ± 9% -0.1 0.06 ± 11% perf-profile.self.cycles-pp.cgroup_rstat_updated 0.21 ± 13% -0.1 0.13 ± 10% perf-profile.self.cycles-pp.xas_start 0.14 ± 37% -0.1 0.06 ± 21% perf-profile.self.cycles-pp.folio_alloc_buffers 0.34 ± 9% -0.1 0.27 ± 11% perf-profile.self.cycles-pp.__alloc_frozen_pages_noprof 0.16 ± 19% -0.1 0.09 ± 12% perf-profile.self.cycles-pp.generic_perform_write 0.08 ± 20% -0.1 0.03 ±100% perf-profile.self.cycles-pp.xas_find_marked 0.11 ± 18% -0.0 0.06 ± 19% perf-profile.self.cycles-pp._raw_spin_unlock_irqrestore 0.20 ± 11% -0.0 0.16 ± 6% perf-profile.self.cycles-pp._raw_spin_trylock 0.14 ± 7% -0.0 0.11 ± 8% perf-profile.self.cycles-pp.up_write 0.05 ± 48% +0.0 
0.08 ± 21% perf-profile.self.cycles-pp.bvec_try_merge_page 0.04 ± 71% +0.0 0.08 ± 10% perf-profile.self.cycles-pp.alloc_pages_noprof 0.04 ± 71% +0.0 0.08 ± 22% perf-profile.self.cycles-pp.__xa_insert 0.03 ±101% +0.1 0.09 ± 17% perf-profile.self.cycles-pp.update_process_times 0.09 ± 17% +0.1 0.14 ± 13% perf-profile.self.cycles-pp.brd_submit_bio 0.20 ± 9% +0.1 0.27 ± 10% perf-profile.self.cycles-pp.ext4_bio_write_folio 0.00 +0.1 0.13 ± 8% perf-profile.self.cycles-pp.folio_alloc_noprof 0.52 ± 29% +0.2 0.68 ± 5% perf-profile.self.cycles-pp.io_serial_out 0.29 ± 36% +0.2 0.51 ± 26% perf-profile.self.cycles-pp.tick_nohz_handler 0.20 ± 8% +0.3 0.47 ± 14% perf-profile.self.cycles-pp.ext4_finish_bio 1.39 ± 10% +0.6 1.96 ± 3% perf-profile.self.cycles-pp.clear_page_erms 0.54 ± 34% +0.6 1.15 ± 24% perf-profile.self.cycles-pp.get_jiffies_update 4.90 ± 32% +1.4 6.34 perf-profile.self.cycles-pp.memcpy_toio 7.20 ± 9% +1.9 9.08 ± 2% perf-profile.self.cycles-pp.rep_movs_alternative 3.94 ± 7% +2.0 5.96 ± 6% perf-profile.self.cycles-pp.memcpy_orig 6.48 ± 7% +2.8 9.28 ± 3% perf-profile.self.cycles-pp._raw_spin_lock Disclaimer: Results have been estimated based on internal Intel analysis and are provided for informational purposes only. Any difference in system hardware or software design or configuration may affect actual performance.
On Mon, May 12, 2025 at 02:33:19PM +0800, Zhang Yi wrote: > From: Zhang Yi <yi.zhang@huawei.com> > > Besides fsverity, fscrypt, and the data=journal mode, ext4 now supports > large folios for regular files. Enable this feature by default. However, > since we cannot change the folio order limitation of mappings on active > inodes, setting the journal=data mode via ioctl on an active inode will > not take immediate effect in non-delalloc mode. > Looks good: Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com> Thanks, Ojaswin > Signed-off-by: Zhang Yi <yi.zhang@huawei.com> > --- > fs/ext4/ext4.h | 1 + > fs/ext4/ext4_jbd2.c | 3 ++- > fs/ext4/ialloc.c | 3 +++ > fs/ext4/inode.c | 20 ++++++++++++++++++++ > 4 files changed, 26 insertions(+), 1 deletion(-) > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h > index 5a20e9cd7184..2fad90c30493 100644 > --- a/fs/ext4/ext4.h > +++ b/fs/ext4/ext4.h > @@ -2993,6 +2993,7 @@ int ext4_walk_page_buffers(handle_t *handle, > struct buffer_head *bh)); > int do_journal_get_write_access(handle_t *handle, struct inode *inode, > struct buffer_head *bh); > +bool ext4_should_enable_large_folio(struct inode *inode); > #define FALL_BACK_TO_NONDELALLOC 1 > #define CONVERT_INLINE_DATA 2 > > diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c > index 135e278c832e..b3e9b7bd7978 100644 > --- a/fs/ext4/ext4_jbd2.c > +++ b/fs/ext4/ext4_jbd2.c > @@ -16,7 +16,8 @@ int ext4_inode_journal_mode(struct inode *inode) > ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || > test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || > (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && > - !test_opt(inode->i_sb, DELALLOC))) { > + !test_opt(inode->i_sb, DELALLOC) && > + !mapping_large_folio_support(inode->i_mapping))) { > /* We do not support data journalling for encrypted data */ > if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) > return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ > diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c > index 
e7ecc7c8a729..4938e78cbadc 100644 > --- a/fs/ext4/ialloc.c > +++ b/fs/ext4/ialloc.c > @@ -1336,6 +1336,9 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap, > } > } > > + if (ext4_should_enable_large_folio(inode)) > + mapping_set_large_folios(inode->i_mapping); > + > ext4_update_inode_fsync_trans(handle, inode, 1); > > err = ext4_mark_inode_dirty(handle, inode); > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c > index 29eccdf8315a..7fd3921cfe46 100644 > --- a/fs/ext4/inode.c > +++ b/fs/ext4/inode.c > @@ -4774,6 +4774,23 @@ static int check_igot_inode(struct inode *inode, ext4_iget_flags flags, > return -EFSCORRUPTED; > } > > +bool ext4_should_enable_large_folio(struct inode *inode) > +{ > + struct super_block *sb = inode->i_sb; > + > + if (!S_ISREG(inode->i_mode)) > + return false; > + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || > + ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) > + return false; > + if (ext4_has_feature_verity(sb)) > + return false; > + if (ext4_has_feature_encrypt(sb)) > + return false; > + > + return true; > +} > + > struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, > ext4_iget_flags flags, const char *function, > unsigned int line) > @@ -5096,6 +5113,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, > ret = -EFSCORRUPTED; > goto bad_inode; > } > + if (ext4_should_enable_large_folio(inode)) > + mapping_set_large_folios(inode->i_mapping); > + > ret = check_igot_inode(inode, flags, function, line); > /* > * -ESTALE here means there is nothing inherently wrong with the inode, > -- > 2.46.1 >
Hi Zhang Yi, Greetings! I used Syzkaller and found that there is general protection fault in try_to_unmap_one in linux-next next-20250623. After bisection and the first bad commit is: " 7ac67301e82f ext4: enable large folio for regular file " All detailed into can be found at: https://github.com/laifryiee/syzkaller_logs/tree/main/250624_222435_try_to_unmap_one Syzkaller repro code: https://github.com/laifryiee/syzkaller_logs/tree/main/250624_222435_try_to_unmap_one/repro.c Syzkaller repro syscall steps: https://github.com/laifryiee/syzkaller_logs/tree/main/250624_222435_try_to_unmap_one/repro.prog Syzkaller report: https://github.com/laifryiee/syzkaller_logs/tree/main/250624_222435_try_to_unmap_one/repro.report Kconfig(make olddefconfig): https://github.com/laifryiee/syzkaller_logs/tree/main/250624_222435_try_to_unmap_one/kconfig_origin Bisect info: https://github.com/laifryiee/syzkaller_logs/tree/main/250624_222435_try_to_unmap_one/bisect_info.log bzImage: https://github.com/laifryiee/syzkaller_logs/raw/refs/heads/main/250624_222435_try_to_unmap_one/bzImage_86731a2a651e58953fc949573895f2fa6d456841 Issue dmesg: https://github.com/laifryiee/syzkaller_logs/blob/main/250624_222435_try_to_unmap_one/86731a2a651e58953fc949573895f2fa6d456841_dmesg.log " [ 48.166741] Injecting memory failure for pfn 0x28c00 at process virtual address 0x20ffc000 [ 48.167878] Memory failure: 0x28c00: Sending SIGBUS to repro:668 due to hardware memory corruption [ 48.169079] Memory failure: 0x28c00: recovery action for unsplit thp: Failed [ 48.657334] Oops: general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] SMP KASI [ 48.658081] KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] [ 48.658561] CPU: 0 UID: 0 PID: 675 Comm: repro Not tainted 6.16.0-rc3-86731a2a651e #1 PREEMPT(voluntary) [ 48.659153] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org4 [ 48.659862] RIP: 
0010:try_to_unmap_one+0x4ef/0x3860 [ 48.660204] Code: f5 a5 ff 48 8b 9d 78 ff ff ff 49 8d 46 18 48 89 85 70 fe ff ff 48 85 db 0f 84 96 1a 00 00 e8 c8 f58 [ 48.661345] RSP: 0018:ffff88801a55ebc0 EFLAGS: 00010246 [ 48.661685] RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffffff81e1a1a1 [ 48.662136] RDX: ffff888014502540 RSI: ffffffff81e186c8 RDI: 0000000000000005 [ 48.662584] RBP: ffff88801a55ed90 R08: 0000000000000001 R09: ffffed10034abd3b [ 48.663030] R10: 0000000000000000 R11: ffff888014503398 R12: 0000000020e00000 [ 48.663490] R13: ffffea0000a30000 R14: ffffea0000a30000 R15: dffffc0000000000 [ 48.663950] FS: 00007f2e4c104740(0000) GS:ffff8880e3562000(0000) knlGS:0000000000000000 [ 48.664464] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 48.664836] CR2: 0000000021000000 CR3: 00000000115ae003 CR4: 0000000000770ef0 [ 48.665297] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 48.665756] DR3: 0000000000000000 DR6: 00000000ffff07f0 DR7: 0000000000000400 [ 48.666210] PKRU: 55555554 [ 48.666398] Call Trace: [ 48.666569] <TASK> [ 48.666729] ? __pfx_try_to_unmap_one+0x10/0x10 [ 48.667048] __rmap_walk_file+0x2a5/0x4a0 [ 48.667324] rmap_walk+0x16b/0x1f0 [ 48.667563] try_to_unmap+0x12f/0x140 [ 48.667818] ? __pfx_try_to_unmap+0x10/0x10 [ 48.668104] ? __pfx_try_to_unmap_one+0x10/0x10 [ 48.668408] ? __pfx_folio_not_mapped+0x10/0x10 [ 48.668713] ? __pfx_folio_lock_anon_vma_read+0x10/0x10 [ 48.669066] ? __sanitizer_cov_trace_const_cmp8+0x1c/0x30 [ 48.669438] unmap_poisoned_folio+0x130/0x500 [ 48.669743] shrink_folio_list+0x44f/0x3d90 [ 48.670036] ? stack_depot_save_flags+0x445/0xa40 [ 48.670366] ? __this_cpu_preempt_check+0x21/0x30 [ 48.670711] ? lockdep_hardirqs_on+0x89/0x110 [ 48.671014] ? __pfx_shrink_folio_list+0x10/0x10 [ 48.671325] ? is_bpf_text_address+0x94/0x1b0 [ 48.671628] ? debug_smp_processor_id+0x20/0x30 [ 48.671937] ? is_bpf_text_address+0x9e/0x1b0 [ 48.672232] ? kernel_text_address+0xd3/0xe0 [ 48.672538] ? 
__kernel_text_address+0x16/0x50 [ 48.672845] ? unwind_get_return_address+0x65/0xb0 [ 48.673178] ? __pfx_stack_trace_consume_entry+0x10/0x10 [ 48.673540] ? arch_stack_walk+0xa1/0xf0 [ 48.673826] reclaim_folio_list+0xe2/0x4c0 [ 48.674104] ? check_path.constprop.0+0x28/0x50 [ 48.674422] ? __pfx_reclaim_folio_list+0x10/0x10 [ 48.674750] ? folio_isolate_lru+0x38c/0x590 [ 48.675047] reclaim_pages+0x393/0x560 [ 48.675306] ? __pfx_reclaim_pages+0x10/0x10 [ 48.675605] ? do_raw_spin_unlock+0x15c/0x210 [ 48.675900] madvise_cold_or_pageout_pte_range+0x1cac/0x2800 [ 48.676287] ? __pfx_madvise_cold_or_pageout_pte_range+0x10/0x10 [ 48.676687] ? lock_is_held_type+0xef/0x150 [ 48.676975] ? __pfx_madvise_cold_or_pageout_pte_range+0x10/0x10 [ 48.677372] walk_pgd_range+0xe2d/0x2420 [ 48.677654] ? __pfx_walk_pgd_range+0x10/0x10 [ 48.677955] __walk_page_range+0x177/0x810 [ 48.678236] ? find_vma+0xc4/0x140 [ 48.678478] ? __pfx_find_vma+0x10/0x10 [ 48.678746] ? __this_cpu_preempt_check+0x21/0x30 [ 48.679062] ? __sanitizer_cov_trace_const_cmp8+0x1c/0x30 [ 48.679428] walk_page_range_mm+0x39f/0x770 [ 48.679718] ? __pfx_walk_page_range_mm+0x10/0x10 [ 48.680038] ? __this_cpu_preempt_check+0x21/0x30 [ 48.680355] ? __sanitizer_cov_trace_const_cmp4+0x1a/0x20 [ 48.680713] ? mlock_drain_local+0x27f/0x4b0 [ 48.681006] walk_page_range+0x70/0xa0 [ 48.681263] ? __kasan_check_write+0x18/0x20 [ 48.681562] madvise_do_behavior+0x13e3/0x35f0 [ 48.681874] ? copy_vma_and_data+0x353/0x7d0 [ 48.682169] ? __pfx_madvise_do_behavior+0x10/0x10 [ 48.682497] ? __pfx_arch_get_unmapped_area_topdown+0x10/0x10 [ 48.682885] ? __this_cpu_preempt_check+0x21/0x30 [ 48.683203] ? lock_is_held_type+0xef/0x150 [ 48.683494] ? __lock_acquire+0x412/0x22a0 [ 48.683789] ? __this_cpu_preempt_check+0x21/0x30 [ 48.684108] ? lock_acquire+0x180/0x310 [ 48.684381] ? __pfx_down_read+0x10/0x10 [ 48.684656] ? __lock_acquire+0x412/0x22a0 [ 48.684953] ? __pfx___do_sys_mremap+0x10/0x10 [ 48.685257] ? 
__sanitizer_cov_trace_switch+0x58/0xa0 [ 48.685603] do_madvise+0x193/0x2b0 [ 48.685852] ? do_madvise+0x193/0x2b0 [ 48.686122] ? __pfx_do_madvise+0x10/0x10 [ 48.686401] ? __this_cpu_preempt_check+0x21/0x30 [ 48.686715] ? seqcount_lockdep_reader_access.constprop.0+0xb4/0xd0 [ 48.687154] ? lockdep_hardirqs_on+0x89/0x110 [ 48.687457] ? trace_hardirqs_on+0x51/0x60 [ 48.687751] ? seqcount_lockdep_reader_access.constprop.0+0xc0/0xd0 [ 48.688162] ? __sanitizer_cov_trace_cmp4+0x1a/0x20 [ 48.688492] ? ktime_get_coarse_real_ts64+0xad/0xf0 [ 48.688823] ? __audit_syscall_entry+0x39c/0x500 [ 48.689134] __x64_sys_madvise+0xb2/0x120 [ 48.689411] ? syscall_trace_enter+0x14d/0x280 [ 48.689720] x64_sys_call+0x19ac/0x2150 [ 48.689987] do_syscall_64+0x6d/0x2e0 [ 48.690248] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 48.690583] RIP: 0033:0x7f2e4be3ee5d [ 48.690842] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d8 [ 48.692016] RSP: 002b:00007ffeb3fe8e68 EFLAGS: 00000217 ORIG_RAX: 000000000000001c [ 48.692503] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f2e4be3ee5d [ 48.692978] RDX: 0000000000000015 RSI: 0000000000c00000 RDI: 0000000020400000 [ 48.693435] RBP: 00007ffeb3fe8e80 R08: 00007ffeb3fe8e80 R09: 00007ffeb3fe8e80 [ 48.693886] R10: 0000000020fc6000 R11: 0000000000000217 R12: 00007ffeb3fe8fd8 [ 48.694344] R13: 00000000004018e5 R14: 0000000000403e08 R15: 00007f2e4c151000 [ 48.694811] </TASK> [ 48.694967] Modules linked in: [ 48.695320] ---[ end trace 0000000000000000 ]--- " Hope this cound be insightful to you. Regards, Yi Lai --- If you don't need the following environment to reproduce the problem or if you already have one reproduced environment, please ignore the following information. 
How to reproduce: git clone https://gitlab.com/xupengfe/repro_vm_env.git cd repro_vm_env tar -xvf repro_vm_env.tar.gz cd repro_vm_env; ./start3.sh // it needs qemu-system-x86_64 and I used v7.1.0 // start3.sh will load bzImage_2241ab53cbb5cdb08a6b2d4688feb13971058f65 v6.2-rc5 kernel // You could change the bzImage_xxx as you want // Maybe you need to remove line "-drive if=pflash,format=raw,readonly=on,file=./OVMF_CODE.fd \" for different qemu version You could use below command to log in, there is no password for root. ssh -p 10023 root@localhost After login vm(virtual machine) successfully, you could transfer reproduced binary to the vm by below way, and reproduce the problem in vm: gcc -pthread -o repro repro.c scp -P 10023 repro root@localhost:/root/ Get the bzImage for target kernel: Please use target kconfig and copy it to kernel_src/.config make olddefconfig make -jx bzImage //x should equal or less than cpu num your pc has Fill the bzImage file into above start3.sh to load the target kernel in vm. Tips: If you already have qemu-system-x86_64, please ignore below info. If you want to install qemu v7.1.0 version: git clone https://github.com/qemu/qemu.git cd qemu git checkout -f v7.1.0 mkdir build cd build yum install -y ninja-build.x86_64 yum -y install libslirp-devel.x86_64 ../configure --target-list=x86_64-softmmu --enable-kvm --enable-vnc --enable-gtk --enable-sdl --enable-usb-redir --enable-slirp make make install On Mon, May 12, 2025 at 02:33:19PM +0800, Zhang Yi wrote: > From: Zhang Yi <yi.zhang@huawei.com> > > Besides fsverity, fscrypt, and the data=journal mode, ext4 now supports > large folios for regular files. Enable this feature by default. However, > since we cannot change the folio order limitation of mappings on active > inodes, setting the journal=data mode via ioctl on an active inode will > not take immediate effect in non-delalloc mode. 
> > Signed-off-by: Zhang Yi <yi.zhang@huawei.com> > --- > fs/ext4/ext4.h | 1 + > fs/ext4/ext4_jbd2.c | 3 ++- > fs/ext4/ialloc.c | 3 +++ > fs/ext4/inode.c | 20 ++++++++++++++++++++ > 4 files changed, 26 insertions(+), 1 deletion(-) > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h > index 5a20e9cd7184..2fad90c30493 100644 > --- a/fs/ext4/ext4.h > +++ b/fs/ext4/ext4.h > @@ -2993,6 +2993,7 @@ int ext4_walk_page_buffers(handle_t *handle, > struct buffer_head *bh)); > int do_journal_get_write_access(handle_t *handle, struct inode *inode, > struct buffer_head *bh); > +bool ext4_should_enable_large_folio(struct inode *inode); > #define FALL_BACK_TO_NONDELALLOC 1 > #define CONVERT_INLINE_DATA 2 > > diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c > index 135e278c832e..b3e9b7bd7978 100644 > --- a/fs/ext4/ext4_jbd2.c > +++ b/fs/ext4/ext4_jbd2.c > @@ -16,7 +16,8 @@ int ext4_inode_journal_mode(struct inode *inode) > ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || > test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || > (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && > - !test_opt(inode->i_sb, DELALLOC))) { > + !test_opt(inode->i_sb, DELALLOC) && > + !mapping_large_folio_support(inode->i_mapping))) { > /* We do not support data journalling for encrypted data */ > if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) > return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ > diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c > index e7ecc7c8a729..4938e78cbadc 100644 > --- a/fs/ext4/ialloc.c > +++ b/fs/ext4/ialloc.c > @@ -1336,6 +1336,9 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap, > } > } > > + if (ext4_should_enable_large_folio(inode)) > + mapping_set_large_folios(inode->i_mapping); > + > ext4_update_inode_fsync_trans(handle, inode, 1); > > err = ext4_mark_inode_dirty(handle, inode); > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c > index 29eccdf8315a..7fd3921cfe46 100644 > --- a/fs/ext4/inode.c > +++ b/fs/ext4/inode.c > @@ -4774,6 +4774,23 @@ 
static int check_igot_inode(struct inode *inode, ext4_iget_flags flags, > return -EFSCORRUPTED; > } > > +bool ext4_should_enable_large_folio(struct inode *inode) > +{ > + struct super_block *sb = inode->i_sb; > + > + if (!S_ISREG(inode->i_mode)) > + return false; > + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || > + ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) > + return false; > + if (ext4_has_feature_verity(sb)) > + return false; > + if (ext4_has_feature_encrypt(sb)) > + return false; > + > + return true; > +} > + > struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, > ext4_iget_flags flags, const char *function, > unsigned int line) > @@ -5096,6 +5113,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, > ret = -EFSCORRUPTED; > goto bad_inode; > } > + if (ext4_should_enable_large_folio(inode)) > + mapping_set_large_folios(inode->i_mapping); > + > ret = check_igot_inode(inode, flags, function, line); > /* > * -ESTALE here means there is nothing inherently wrong with the inode, > -- > 2.46.1 >
It looks like this failure requires using madvise() with MADV_HWPOISON (which requires root) and MADV_PAGEOUT, and the stack trace is deep in an mm codepath: madvise_cold_or_pageout_pte_range+0x1cac/0x2800 reclaim_pages+0x393/0x560 reclaim_folio_list+0xe2/0x4c0 shrink_folio_list+0x44f/0x3d90 unmap_poisoned_folio+0x130/0x500 try_to_unmap+0x12f/0x140 rmap_walk+0x16b/0x1f0 ... The bisected commit is the one which enables using large folios, so while it's possible that this is due to ext4 doing something not quite right when using large folios, it's also possible that this might be a bug in the folio/mm code paths. Does this reproduce on other file systems, such as XFS? - Ted
On Wed, Jun 25, 2025 at 09:15:45AM -0400, Theodore Ts'o wrote: > It looks like this failure requires using madvise() with MADV_HWPOISON > (which requires root) and MADV_PAGEOUT, and the stack trace is in deep > in the an mm codepath: > > madvise_cold_or_pageout_pte_range+0x1cac/0x2800 > reclaim_pages+0x393/0x560 > reclaim_folio_list+0xe2/0x4c0 > shrink_folio_list+0x44f/0x3d90 > unmap_poisoned_folio+0x130/0x500 > try_to_unmap+0x12f/0x140 > rmap_walk+0x16b/0x1f0 > ... > > The bisected commit is the one which enables using large folios, so > while it's possible that this due to ext4 doing something not quite > right when using large folios, it's also posible that this might be a > bug in the folio/mm code paths. > > Does this reproduce on other file systems, such as XFS? > Indeed, this issue can also be reproduced on XFS file system. Thanks for the advice. I will conduct cross-filesystem validation next time when I encounter ext4 issue. [ 395.888267] Oops: general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] SMP KASI [ 395.888767] KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] [ 395.889150] CPU: 2 UID: 0 PID: 7420 Comm: repro Not tainted 6.16.0-rc3-86731a2a651e #1 PREEMPT(voluntary) [ 395.889620] Hardware name: Red Hat KVM/RHEL, BIOS edk2-20241117-3.el9 11/17/2024 [ 395.889967] RIP: 0010:try_to_unmap_one+0x4ef/0x3860 [ 395.890230] Code: f5 a5 ff 48 8b 9d 78 ff ff ff 49 8d 46 18 48 89 85 70 fe ff ff 48 85 db 0f 84 96 1a 00 00 e8 c8 f58 [ 395.891081] RSP: 0018:ff1100011869ebc0 EFLAGS: 00010246 [ 395.891337] RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffffff81e1a1a1 [ 395.891676] RDX: ff11000130330000 RSI: ffffffff81e186c8 RDI: 0000000000000005 [ 395.892018] RBP: ff1100011869ed90 R08: 0000000000000001 R09: ffe21c00230d3d3b [ 395.892356] R10: 0000000000000000 R11: ff11000130330e58 R12: 0000000020e00000 [ 395.892691] R13: ffd40000043c8000 R14: ffd40000043c8000 R15: dffffc0000000000 [ 395.893043] FS: 
00007fbd34523740(0000) GS:ff110004a4e62000(0000) knlGS:0000000000000000 [ 395.893437] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 395.893718] CR2: 0000000021000000 CR3: 000000010f8bf004 CR4: 0000000000771ef0 [ 395.894060] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 395.894398] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 [ 395.894732] PKRU: 55555554 [ 395.894868] Call Trace: [ 395.894991] <TASK> [ 395.895109] ? __pfx_try_to_unmap_one+0x10/0x10 [ 395.895337] __rmap_walk_file+0x2a5/0x4a0 [ 395.895538] rmap_walk+0x16b/0x1f0 [ 395.895706] try_to_unmap+0x12f/0x140 [ 395.895853] ? __pfx_try_to_unmap+0x10/0x10 [ 395.896061] ? __pfx_try_to_unmap_one+0x10/0x10 [ 395.896284] ? __pfx_folio_not_mapped+0x10/0x10 [ 395.896504] ? __pfx_folio_lock_anon_vma_read+0x10/0x10 [ 395.896758] ? __sanitizer_cov_trace_const_cmp8+0x1c/0x30 [ 395.897025] unmap_poisoned_folio+0x130/0x500 [ 395.897251] shrink_folio_list+0x44f/0x3d90 [ 395.897476] ? __pfx_shrink_folio_list+0x10/0x10 [ 395.897719] ? is_bpf_text_address+0x94/0x1b0 [ 395.897941] ? debug_smp_processor_id+0x20/0x30 [ 395.898172] ? is_bpf_text_address+0x9e/0x1b0 [ 395.898387] ? kernel_text_address+0xd3/0xe0 [ 395.898604] ? __kernel_text_address+0x16/0x50 [ 395.898827] ? unwind_get_return_address+0x65/0xb0 [ 395.899066] ? __pfx_stack_trace_consume_entry+0x10/0x10 [ 395.899326] ? arch_stack_walk+0xa1/0xf0 [ 395.899530] reclaim_folio_list+0xe2/0x4c0 [ 395.899733] ? check_path.constprop.0+0x28/0x50 [ 395.899963] ? __pfx_reclaim_folio_list+0x10/0x10 [ 395.900198] ? folio_isolate_lru+0x38c/0x590 [ 395.900412] reclaim_pages+0x393/0x560 [ 395.900606] ? __pfx_reclaim_pages+0x10/0x10 [ 395.900824] ? do_raw_spin_unlock+0x15c/0x210 [ 395.901044] madvise_cold_or_pageout_pte_range+0x1cac/0x2800 [ 395.901326] ? __pfx_madvise_cold_or_pageout_pte_range+0x10/0x10 [ 395.901631] ? lock_is_held_type+0xef/0x150 [ 395.901852] ? 
__pfx_madvise_cold_or_pageout_pte_range+0x10/0x10 [ 395.902158] walk_pgd_range+0xe2d/0x2420 [ 395.902373] ? __pfx_walk_pgd_range+0x10/0x10 [ 395.902593] __walk_page_range+0x177/0x810 [ 395.902799] ? find_vma+0xc4/0x140 [ 395.902977] ? __pfx_find_vma+0x10/0x10 [ 395.903176] ? __this_cpu_preempt_check+0x21/0x30 [ 395.903401] ? __sanitizer_cov_trace_const_cmp8+0x1c/0x30 [ 395.903667] walk_page_range_mm+0x39f/0x770 [ 395.903877] ? __pfx_walk_page_range_mm+0x10/0x10 [ 395.904109] ? __this_cpu_preempt_check+0x21/0x30 [ 395.904340] ? __sanitizer_cov_trace_const_cmp4+0x1a/0x20 [ 395.904606] ? mlock_drain_local+0x27f/0x4b0 [ 395.904826] walk_page_range+0x70/0xa0 [ 395.905013] ? __kasan_check_write+0x18/0x20 [ 395.905227] madvise_do_behavior+0x13e3/0x35f0 [ 395.905453] ? copy_vma_and_data+0x353/0x7d0 [ 395.905674] ? __pfx_madvise_do_behavior+0x10/0x10 [ 395.905922] ? __pfx_arch_get_unmapped_area_topdown+0x10/0x10 [ 395.906219] ? __this_cpu_preempt_check+0x21/0x30 [ 395.906455] ? lock_is_held_type+0xef/0x150 [ 395.906665] ? __lock_acquire+0x412/0x22a0 [ 395.906875] ? __this_cpu_preempt_check+0x21/0x30 [ 395.907105] ? lock_acquire+0x180/0x310 [ 395.907306] ? __pfx_down_read+0x10/0x10 [ 395.907503] ? __lock_acquire+0x412/0x22a0 [ 395.907707] ? __pfx___do_sys_mremap+0x10/0x10 [ 395.907929] ? __sanitizer_cov_trace_switch+0x58/0xa0 [ 395.908186] do_madvise+0x193/0x2b0 [ 395.908363] ? do_madvise+0x193/0x2b0 [ 395.908550] ? __pfx_do_madvise+0x10/0x10 [ 395.908801] ? __this_cpu_preempt_check+0x21/0x30 [ 395.909036] ? seqcount_lockdep_reader_access.constprop.0+0xb4/0xd0 [ 395.909335] ? lockdep_hardirqs_on+0x89/0x110 [ 395.909556] ? trace_hardirqs_on+0x51/0x60 [ 395.909763] ? seqcount_lockdep_reader_access.constprop.0+0xc0/0xd0 [ 395.910073] ? __sanitizer_cov_trace_cmp4+0x1a/0x20 [ 395.910332] ? ktime_get_coarse_real_ts64+0xad/0xf0 [ 395.910578] ? __audit_syscall_entry+0x39c/0x500 [ 395.910812] __x64_sys_madvise+0xb2/0x120 [ 395.911016] ? 
syscall_trace_enter+0x14d/0x280 [ 395.911240] x64_sys_call+0x19ac/0x2150 [ 395.911431] do_syscall_64+0x6d/0x2e0 [ 395.911619] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 395.911865] RIP: 0033:0x7fbd3430756d [ 395.912046] Code: 5b 41 5c c3 66 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d8 [ 395.912905] RSP: 002b:00007ffe6486ec48 EFLAGS: 00000217 ORIG_RAX: 000000000000001c [ 395.913267] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fbd3430756d [ 395.913603] RDX: 0000000000000015 RSI: 0000000000c00000 RDI: 0000000020400000 [ 395.913941] RBP: 00007ffe6486ec60 R08: 00007ffe6486ec60 R09: 00007ffe6486ec60 [ 395.914280] R10: 0000000020fc6000 R11: 0000000000000217 R12: 00007ffe6486edb8 [ 395.914629] R13: 00000000004018e5 R14: 0000000000403e08 R15: 00007fbd3456a000 [ 395.914989] </TASK> [ 395.915111] Modules linked in: [ 395.915296] ---[ end trace 0000000000000000 ]--- FYI, there is ongoing discussion in terms of folio/mm domain - https://lore.kernel.org/all/20250611074643.250837-1-tujinjiang@huawei.com/T/ Regards, Yi Lai > - Ted
Hello Zhang Yi, On 5/12/2025 12:03 PM, Zhang Yi wrote: > From: Zhang Yi <yi.zhang@huawei.com> > > Besides fsverity, fscrypt, and the data=journal mode, ext4 now supports > large folios for regular files. Enable this feature by default. However, > since we cannot change the folio order limitation of mappings on active > inodes, setting the journal=data mode via ioctl on an active inode will > not take immediate effect in non-delalloc mode. > We run lmbench3 as part of our Weekly CI for the purpose of Kernel Performance Regression testing between a stable vs rc kernel. We noticed a regression on the kernels starting from 6.16-rc1 all the way through 6.16-rc3 in the range of 8-12%. Further bisection b/w 6.15 and 6.16-rc1 pointed me to the first bad commit as 7ac67301e82f02b77a5c8e7377a1f414ef108b84. The following were the machine configurations and test parameters used:- Model name: AMD EPYC 9754 128-Core Processor [Bergamo] Thread(s) per core: 2 Core(s) per socket: 128 Socket(s): 1 Total online memory: 258G micro-benchmark_variant: "lmbench3-development-1-0-MMAP-50%" which has the following parameters, -> nr_thread: 1 -> memory_size: 50% -> mode: development -> test: MMAP The following are the stats after bisection:- (the KPI used here is lmbench3.MMAP.read.latency.us) v6.15 - 97.3K v6.16-rc1 - 107.5K v6.16-rc3 - 107.4K 6.15.0-rc4badcommit - 103.5K 6.15.0-rc4badcommit_m1 (one commit before bad-commit) - 94.2K I also ran the micro-benchmark with tools/testing/perf record and following is the output from tools/testing/perf diff b/w the bad commit and just one commit before that. # ./perf diff perf.data.old perf.data No kallsyms or vmlinux with build-id da8042fb274c5e3524318e5e3afbeeef5df2055e was found # Event 'cycles:P' # # Baseline Delta Abs Shared Object Symbol > # ........ ......... ....................... 
....................................................................................................................................................................................> # +4.34% [kernel.kallsyms] [k] __lruvec_stat_mod_folio +3.41% [kernel.kallsyms] [k] unmap_page_range +3.33% [kernel.kallsyms] [k] __mod_memcg_lruvec_state +2.04% [kernel.kallsyms] [k] srso_alias_return_thunk +2.02% [kernel.kallsyms] [k] srso_alias_safe_ret 22.22% -1.78% bw_mmap_rd [.] bread +1.76% [kernel.kallsyms] [k] __handle_mm_fault +1.70% [kernel.kallsyms] [k] filemap_map_pages +1.58% [kernel.kallsyms] [k] set_pte_range +1.58% [kernel.kallsyms] [k] next_uptodate_folio +1.33% [kernel.kallsyms] [k] do_anonymous_page +1.01% [kernel.kallsyms] [k] get_page_from_freelist +0.98% [kernel.kallsyms] [k] __mem_cgroup_charge +0.85% [kernel.kallsyms] [k] asm_exc_page_fault +0.82% [kernel.kallsyms] [k] native_irq_return_iret +0.82% [kernel.kallsyms] [k] do_user_addr_fault +0.77% [kernel.kallsyms] [k] clear_page_erms +0.75% [kernel.kallsyms] [k] handle_mm_fault +0.73% [kernel.kallsyms] [k] set_ptes.isra.0 +0.70% [kernel.kallsyms] [k] lru_add +0.69% [kernel.kallsyms] [k] folio_add_file_rmap_ptes +0.68% [kernel.kallsyms] [k] folio_remove_rmap_ptes 12.45% -0.65% line [.] 
mem_benchmark_0 +0.64% [kernel.kallsyms] [k] __alloc_frozen_pages_noprof +0.63% [kernel.kallsyms] [k] vm_normal_page +0.63% [kernel.kallsyms] [k] free_pages_and_swap_cache +0.63% [kernel.kallsyms] [k] lock_vma_under_rcu +0.60% [kernel.kallsyms] [k] __rcu_read_unlock +0.59% [kernel.kallsyms] [k] cgroup_rstat_updated +0.57% [kernel.kallsyms] [k] get_mem_cgroup_from_mm +0.52% [kernel.kallsyms] [k] __mod_lruvec_state +0.51% [kernel.kallsyms] [k] exc_page_fault > Signed-off-by: Zhang Yi <yi.zhang@huawei.com> > --- > fs/ext4/ext4.h | 1 + > fs/ext4/ext4_jbd2.c | 3 ++- > fs/ext4/ialloc.c | 3 +++ > fs/ext4/inode.c | 20 ++++++++++++++++++++ > 4 files changed, 26 insertions(+), 1 deletion(-) > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h > index 5a20e9cd7184..2fad90c30493 100644 > --- a/fs/ext4/ext4.h > +++ b/fs/ext4/ext4.h > @@ -2993,6 +2993,7 @@ int ext4_walk_page_buffers(handle_t *handle, > struct buffer_head *bh)); > int do_journal_get_write_access(handle_t *handle, struct inode *inode, > struct buffer_head *bh); > +bool ext4_should_enable_large_folio(struct inode *inode); > #define FALL_BACK_TO_NONDELALLOC 1 > #define CONVERT_INLINE_DATA 2 > > diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c > index 135e278c832e..b3e9b7bd7978 100644 > --- a/fs/ext4/ext4_jbd2.c > +++ b/fs/ext4/ext4_jbd2.c > @@ -16,7 +16,8 @@ int ext4_inode_journal_mode(struct inode *inode) > ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || > test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || > (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && > - !test_opt(inode->i_sb, DELALLOC))) { > + !test_opt(inode->i_sb, DELALLOC) && > + !mapping_large_folio_support(inode->i_mapping))) { > /* We do not support data journalling for encrypted data */ > if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) > return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ > diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c > index e7ecc7c8a729..4938e78cbadc 100644 > --- a/fs/ext4/ialloc.c > +++ 
b/fs/ext4/ialloc.c > @@ -1336,6 +1336,9 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap, > } > } > > + if (ext4_should_enable_large_folio(inode)) > + mapping_set_large_folios(inode->i_mapping); > + > ext4_update_inode_fsync_trans(handle, inode, 1); > > err = ext4_mark_inode_dirty(handle, inode); > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c > index 29eccdf8315a..7fd3921cfe46 100644 > --- a/fs/ext4/inode.c > +++ b/fs/ext4/inode.c > @@ -4774,6 +4774,23 @@ static int check_igot_inode(struct inode *inode, ext4_iget_flags flags, > return -EFSCORRUPTED; > } > > +bool ext4_should_enable_large_folio(struct inode *inode) > +{ > + struct super_block *sb = inode->i_sb; > + > + if (!S_ISREG(inode->i_mode)) > + return false; > + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || > + ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) > + return false; > + if (ext4_has_feature_verity(sb)) > + return false; > + if (ext4_has_feature_encrypt(sb)) > + return false; > + > + return true; > +} > + > struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, > ext4_iget_flags flags, const char *function, > unsigned int line) > @@ -5096,6 +5113,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, > ret = -EFSCORRUPTED; > goto bad_inode; > } > + if (ext4_should_enable_large_folio(inode)) > + mapping_set_large_folios(inode->i_mapping); > + > ret = check_igot_inode(inode, flags, function, line); > /* > * -ESTALE here means there is nothing inherently wrong with the inode, --- Thanks and Regards, Suneeth D Steps to run lmbench3 1. git clone https://github.com/intel/lmbench.git 2. git clone https://github.com/intel/lkp-tests.git 3. cd lmbench 4. git apply lkp-tests/programs/lmbench3/pkg/lmbench3.patch 5. make 6. sed -i '/lat_pagefault -P no/i [ -f no ] || dd if=/dev/zero of=no count=1 bs=1G' bin/x86_64-linux-gnu/lmbench 7. 
( echo 1 echo 1 echo 10240 echo development echo no echo no echo no echo no echo no echo yes echo no echo no echo no echo no echo no echo no echo no echo no echo no echo no echo no echo yes echo echo echo [ 1 -eq 1 ] && echo echo no ) | make results 8. cd results/ && make
Hello Suneeth D! On 2025/6/26 19:29, D, Suneeth wrote: > > Hello Zhang Yi, > > On 5/12/2025 12:03 PM, Zhang Yi wrote: >> From: Zhang Yi <yi.zhang@huawei.com> >> >> Besides fsverity, fscrypt, and the data=journal mode, ext4 now supports >> large folios for regular files. Enable this feature by default. However, >> since we cannot change the folio order limitation of mappings on active >> inodes, setting the journal=data mode via ioctl on an active inode will >> not take immediate effect in non-delalloc mode. >> > > We run lmbench3 as part of our Weekly CI for the purpose of Kernel Performance Regression testing between a stable vs rc kernel. We noticed a regression on the kernels starting from 6.16-rc1 all the way through 6.16-rc3 in the range of 8-12%. Further bisection b/w 6.15 and 6.16-rc1 pointed me to the first bad commit as 7ac67301e82f02b77a5c8e7377a1f414ef108b84. The following were the machine configurations and test parameters used:- > > Model name: AMD EPYC 9754 128-Core Processor [Bergamo] > Thread(s) per core: 2 > Core(s) per socket: 128 > Socket(s): 1 > Total online memory: 258G > > micro-benchmark_variant: "lmbench3-development-1-0-MMAP-50%" which has the following parameters, > > -> nr_thread: 1 > -> memory_size: 50% > -> mode: development > -> test: MMAP > > The following are the stats after bisection:- > > (the KPI used here is lmbench3.MMAP.read.latency.us) > > v6.15 - 97.3K > > v6.16-rc1 - 107.5K > > v6.16-rc3 - 107.4K > > 6.15.0-rc4badcommit - 103.5K > > 6.15.0-rc4badcommit_m1 (one commit before bad-commit) - 94.2K Thanks for the report, I will try to reproduce this performance regression on my machine and find out what caused this regression. Thanks, Yi. > > I also ran the micro-benchmark with tools/testing/perf record and following is the output from tools/testing/perf diff b/w the bad commit and just one commit before that. 
> > # ./perf diff perf.data.old perf.data > No kallsyms or vmlinux with build-id da8042fb274c5e3524318e5e3afbeeef5df2055e was found > # Event 'cycles:P' > # > # Baseline Delta Abs Shared Object Symbol > > > > # ........ ......... ....................... ....................................................................................................................................................................................> > # > +4.34% [kernel.kallsyms] [k] __lruvec_stat_mod_folio > +3.41% [kernel.kallsyms] [k] unmap_page_range > +3.33% [kernel.kallsyms] [k] __mod_memcg_lruvec_state > +2.04% [kernel.kallsyms] [k] srso_alias_return_thunk > +2.02% [kernel.kallsyms] [k] srso_alias_safe_ret > 22.22% -1.78% bw_mmap_rd [.] bread > +1.76% [kernel.kallsyms] [k] __handle_mm_fault > +1.70% [kernel.kallsyms] [k] filemap_map_pages > +1.58% [kernel.kallsyms] [k] set_pte_range > +1.58% [kernel.kallsyms] [k] next_uptodate_folio > +1.33% [kernel.kallsyms] [k] do_anonymous_page > +1.01% [kernel.kallsyms] [k] get_page_from_freelist > +0.98% [kernel.kallsyms] [k] __mem_cgroup_charge > +0.85% [kernel.kallsyms] [k] asm_exc_page_fault > +0.82% [kernel.kallsyms] [k] native_irq_return_iret > +0.82% [kernel.kallsyms] [k] do_user_addr_fault > +0.77% [kernel.kallsyms] [k] clear_page_erms > +0.75% [kernel.kallsyms] [k] handle_mm_fault > +0.73% [kernel.kallsyms] [k] set_ptes.isra.0 > +0.70% [kernel.kallsyms] [k] lru_add > +0.69% [kernel.kallsyms] [k] folio_add_file_rmap_ptes > +0.68% [kernel.kallsyms] [k] folio_remove_rmap_ptes > 12.45% -0.65% line [.] 
mem_benchmark_0 > +0.64% [kernel.kallsyms] [k] __alloc_frozen_pages_noprof > +0.63% [kernel.kallsyms] [k] vm_normal_page > +0.63% [kernel.kallsyms] [k] free_pages_and_swap_cache > +0.63% [kernel.kallsyms] [k] lock_vma_under_rcu > +0.60% [kernel.kallsyms] [k] __rcu_read_unlock > +0.59% [kernel.kallsyms] [k] cgroup_rstat_updated > +0.57% [kernel.kallsyms] [k] get_mem_cgroup_from_mm > +0.52% [kernel.kallsyms] [k] __mod_lruvec_state > +0.51% [kernel.kallsyms] [k] exc_page_fault > >> Signed-off-by: Zhang Yi <yi.zhang@huawei.com> >> --- >> fs/ext4/ext4.h | 1 + >> fs/ext4/ext4_jbd2.c | 3 ++- >> fs/ext4/ialloc.c | 3 +++ >> fs/ext4/inode.c | 20 ++++++++++++++++++++ >> 4 files changed, 26 insertions(+), 1 deletion(-) >> >> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h >> index 5a20e9cd7184..2fad90c30493 100644 >> --- a/fs/ext4/ext4.h >> +++ b/fs/ext4/ext4.h >> @@ -2993,6 +2993,7 @@ int ext4_walk_page_buffers(handle_t *handle, >> struct buffer_head *bh)); >> int do_journal_get_write_access(handle_t *handle, struct inode *inode, >> struct buffer_head *bh); >> +bool ext4_should_enable_large_folio(struct inode *inode); >> #define FALL_BACK_TO_NONDELALLOC 1 >> #define CONVERT_INLINE_DATA 2 >> diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c >> index 135e278c832e..b3e9b7bd7978 100644 >> --- a/fs/ext4/ext4_jbd2.c >> +++ b/fs/ext4/ext4_jbd2.c >> @@ -16,7 +16,8 @@ int ext4_inode_journal_mode(struct inode *inode) >> ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || >> test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || >> (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && >> - !test_opt(inode->i_sb, DELALLOC))) { >> + !test_opt(inode->i_sb, DELALLOC) && >> + !mapping_large_folio_support(inode->i_mapping))) { >> /* We do not support data journalling for encrypted data */ >> if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) >> return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ >> diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c >> index 
e7ecc7c8a729..4938e78cbadc 100644 >> --- a/fs/ext4/ialloc.c >> +++ b/fs/ext4/ialloc.c >> @@ -1336,6 +1336,9 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap, >> } >> } >> + if (ext4_should_enable_large_folio(inode)) >> + mapping_set_large_folios(inode->i_mapping); >> + >> ext4_update_inode_fsync_trans(handle, inode, 1); >> err = ext4_mark_inode_dirty(handle, inode); >> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c >> index 29eccdf8315a..7fd3921cfe46 100644 >> --- a/fs/ext4/inode.c >> +++ b/fs/ext4/inode.c >> @@ -4774,6 +4774,23 @@ static int check_igot_inode(struct inode *inode, ext4_iget_flags flags, >> return -EFSCORRUPTED; >> } >> +bool ext4_should_enable_large_folio(struct inode *inode) >> +{ >> + struct super_block *sb = inode->i_sb; >> + >> + if (!S_ISREG(inode->i_mode)) >> + return false; >> + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || >> + ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) >> + return false; >> + if (ext4_has_feature_verity(sb)) >> + return false; >> + if (ext4_has_feature_encrypt(sb)) >> + return false; >> + >> + return true; >> +} >> + >> struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, >> ext4_iget_flags flags, const char *function, >> unsigned int line) >> @@ -5096,6 +5113,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, >> ret = -EFSCORRUPTED; >> goto bad_inode; >> } >> + if (ext4_should_enable_large_folio(inode)) >> + mapping_set_large_folios(inode->i_mapping); >> + >> ret = check_igot_inode(inode, flags, function, line); >> /* >> * -ESTALE here means there is nothing inherently wrong with the inode, > > --- > Thanks and Regards, > Suneeth D
On Thu, Jun 26, 2025 at 09:26:41PM +0800, Zhang Yi wrote: > > Thanks for the report, I will try to reproduce this performance regression on > my machine and find out what caused this regression. I took a quick look at this, and I *think* it's because lmbench is measuring the latency of mmap reads --- I'm going to guess 4k random page faults, but I'm not sure. If that's the case, this may just be a natural result of using large folios, and the tradeoff of optimizing for large reads versus small page faults. But if you could take a closer look, that would be great, thanks! - Ted
On 2025/6/26 22:56, Theodore Ts'o wrote: > On Thu, Jun 26, 2025 at 09:26:41PM +0800, Zhang Yi wrote: >> >> Thanks for the report, I will try to reproduce this performance regression on >> my machine and find out what caused this regression. > > I took a quick look at this, and I *think* it's because lmbench is > measuring the latency of mmap read's --- I'm going to guess 4k random > page faults, but I'm not sure. If that's the case, this may just be a > natural result of using large folios, and the tradeoff of optimizing > for large reads versus small page faults. > > But if you could take a closer look, that would be great, thanks! > After analyzing what the lmbench mmap test actually does, I found that the regression is related to the mmap writes, not mmap reads. In other words, the latency increases in ext4_page_mkwrite() after we enable large folios. The lmbench mmap test performed the following two tests: 1. mmap a range with PROT_READ|PROT_WRITE and MAP_SHARED, and then write one byte every 16KB sequentially. 2. mmap a range with PROT_READ and MAP_SHARED, and then read byte one by one sequentially. For the mmap read test, the average page fault latency on my machine can be improved from 3,634 ns to 2,005 ns. This improvement is due to the ability to save the folio readahead loop in page_cache_async_ra() and the set PTE loop in filemap_map_pages() after implementing support for large folios. For the mmap write test, the number of page faults does not decrease due to the large folio (the maximum order is 5), each page still incurs one page fault. However, the ext4_page_mkwrite() does multiple iterations through buffer_head in the folio, so the time consumption will increase. The latency of ext4_page_mkwrite() can be increased from 958ns to 1596ns. After looking at the comments in finish_fault() and 43e027e414232 ("mm: memory: extend finish_fault() to support large folio"). vm_fault_t finish_fault(struct vm_fault *vmf) { ... 
nr_pages = folio_nr_pages(folio); /* * Using per-page fault to maintain the uffd semantics, and same * approach also applies to non-anonymous-shmem faults to avoid * inflating the RSS of the process. */ if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma)) || unlikely(needs_fallback)) { nr_pages = 1; ... set_pte_range(vmf, folio, page, nr_pages, addr); } I believe this regression can be resolved if finish_fault() supports file-based large folios, but I'm not sure if we are planning to implement this. As for ext4_page_mkwrite(), I think it can also be optimized by reducing the number of folio iterations, but this would make it impossible to use existing generic helpers and could make the code very messy. Best regards, Yi.
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5a20e9cd7184..2fad90c30493 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2993,6 +2993,7 @@ int ext4_walk_page_buffers(handle_t *handle, struct buffer_head *bh)); int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh); +bool ext4_should_enable_large_folio(struct inode *inode); #define FALL_BACK_TO_NONDELALLOC 1 #define CONVERT_INLINE_DATA 2 diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 135e278c832e..b3e9b7bd7978 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -16,7 +16,8 @@ int ext4_inode_journal_mode(struct inode *inode) ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && - !test_opt(inode->i_sb, DELALLOC))) { + !test_opt(inode->i_sb, DELALLOC) && + !mapping_large_folio_support(inode->i_mapping))) { /* We do not support data journalling for encrypted data */ if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index e7ecc7c8a729..4938e78cbadc 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1336,6 +1336,9 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap, } } + if (ext4_should_enable_large_folio(inode)) + mapping_set_large_folios(inode->i_mapping); + ext4_update_inode_fsync_trans(handle, inode, 1); err = ext4_mark_inode_dirty(handle, inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 29eccdf8315a..7fd3921cfe46 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4774,6 +4774,23 @@ static int check_igot_inode(struct inode *inode, ext4_iget_flags flags, return -EFSCORRUPTED; } +bool ext4_should_enable_large_folio(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (!S_ISREG(inode->i_mode)) + return false; + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + 
ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) + return false; + if (ext4_has_feature_verity(sb)) + return false; + if (ext4_has_feature_encrypt(sb)) + return false; + + return true; +} + struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ext4_iget_flags flags, const char *function, unsigned int line) @@ -5096,6 +5113,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ret = -EFSCORRUPTED; goto bad_inode; } + if (ext4_should_enable_large_folio(inode)) + mapping_set_large_folios(inode->i_mapping); + ret = check_igot_inode(inode, flags, function, line); /* * -ESTALE here means there is nothing inherently wrong with the inode,