diff mbox series

[v2,8/8] ext4: enable large folio for regular file

Message ID 20250512063319.3539411-9-yi.zhang@huaweicloud.com
State Awaiting Upstream
Headers show
Series ext4: enable large folio for regular files | expand

Commit Message

Zhang Yi May 12, 2025, 6:33 a.m. UTC
From: Zhang Yi <yi.zhang@huawei.com>

Except for fsverity, fscrypt, and the data=journal mode, ext4 now supports
large folios for regular files. Enable this feature by default. However,
since we cannot change the folio order limitation of mappings on active
inodes, setting the data=journal mode via ioctl on an active inode will
not take immediate effect in non-delalloc mode.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
 fs/ext4/ext4.h      |  1 +
 fs/ext4/ext4_jbd2.c |  3 ++-
 fs/ext4/ialloc.c    |  3 +++
 fs/ext4/inode.c     | 20 ++++++++++++++++++++
 4 files changed, 26 insertions(+), 1 deletion(-)

Comments

kernel test robot May 16, 2025, 9:05 a.m. UTC | #1
Hello,

kernel test robot noticed a 37.7% improvement of fsmark.files_per_sec on:


commit: 34696dd792d839c46a280c720ab28aab2db1f4bf ("[PATCH v2 8/8] ext4: enable large folio for regular file")
url: https://github.com/intel-lab-lkp/linux/commits/Zhang-Yi/ext4-make-ext4_mpage_readpages-support-large-folios/20250512-144942
base: https://git.kernel.org/cgit/linux/kernel/git/tytso/ext4.git dev
patch link: https://lore.kernel.org/all/20250512063319.3539411-9-yi.zhang@huaweicloud.com/
patch subject: [PATCH v2 8/8] ext4: enable large folio for regular file

testcase: fsmark
config: x86_64-rhel-9.4
compiler: gcc-12
test machine: 96 threads 2 sockets Intel(R) Xeon(R) Platinum 8260L CPU @ 2.40GHz (Cascade Lake) with 128G memory
parameters:

	iterations: 1x
	nr_threads: 1t
	disk: 1BRD_48G
	fs: ext4
	filesize: 4M
	test_size: 24G
	sync_method: NoSync
	cpufreq_governor: performance



Details are as below:
-------------------------------------------------------------------------------------------------->


The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20250516/202505161418.ec0d753f-lkp@intel.com

=========================================================================================
compiler/cpufreq_governor/disk/filesize/fs/iterations/kconfig/nr_threads/rootfs/sync_method/tbox_group/test_size/testcase:
  gcc-12/performance/1BRD_48G/4M/ext4/1x/x86_64-rhel-9.4/1t/debian-12-x86_64-20240206.cgz/NoSync/lkp-csl-2sp3/24G/fsmark

commit: 
  0368e6caf2 ("ext4: make online defragmentation support large folios")
  34696dd792 ("ext4: enable large folio for regular file")

0368e6caf2d6ff21 34696dd792d839c46a280c720ab 
---------------- --------------------------- 
         %stddev     %change         %stddev
             \          |                \  
 1.964e+09 ±  3%     -14.9%  1.671e+09 ±  8%  cpuidle..time
      3825 ± 20%     -24.6%       2884 ± 17%  sched_debug.cpu.avg_idle.min
     69081            -3.2%      66894        fsmark.app_overhead
    529.15           +37.7%     728.75        fsmark.files_per_sec
     70.33 ±  3%     -11.6%      62.17        fsmark.time.percent_of_cpu_this_job_got
      3.50 ± 54%    +109.5%       7.33 ± 21%  perf-sched.wait_and_delay.count.schedule_hrtimeout_range.ep_poll.do_epoll_wait.__x64_sys_epoll_wait
      4.00 ± 14%     +21.0%       4.84 ± 21%  perf-sched.wait_and_delay.max.ms.schedule_timeout.rcu_gp_fqs_loop.rcu_gp_kthread.kthread
      0.91 ± 12%     -14.1%       0.78 ±  8%  perf-sched.wait_time.max.ms.do_wait.kernel_wait4.do_syscall_64.entry_SYSCALL_64_after_hwframe
    691923 ± 25%     -62.0%     263011 ± 18%  proc-vmstat.numa_foreign
  12650103 ±  4%     -35.5%    8153366        proc-vmstat.numa_hit
  12607229 ±  3%     -36.1%    8052736        proc-vmstat.numa_local
    707756 ± 22%     -62.8%     263011 ± 18%  proc-vmstat.numa_miss
    806083 ± 25%     -55.0%     362335 ± 13%  proc-vmstat.numa_other
    231973            -3.4%     224103        proc-vmstat.pgfault
      3.86 ±  2%     +37.3%       5.29 ±  9%  perf-stat.i.MPKI
 1.889e+09           -11.1%  1.679e+09 ±  2%  perf-stat.i.branch-instructions
      3.60 ±  3%      +0.3        3.91 ±  5%  perf-stat.i.branch-miss-rate%
  34677646           +17.9%   40883980 ±  8%  perf-stat.i.cache-misses
      2003 ±  2%     +10.2%       2209 ±  4%  perf-stat.i.context-switches
      1.08           +14.4%       1.24 ±  4%  perf-stat.i.cpi
 9.762e+09           -10.6%  8.728e+09 ±  2%  perf-stat.i.instructions
      0.98            -9.7%       0.89 ±  4%  perf-stat.i.ipc
      7068 ±  3%     +14.7%       8105 ±  6%  perf-stat.i.minor-faults
      7068 ±  3%     +14.7%       8105 ±  6%  perf-stat.i.page-faults
      3.56           +31.5%       4.68 ±  6%  perf-stat.overall.MPKI
      3.67 ±  2%      +0.4        4.06 ±  2%  perf-stat.overall.branch-miss-rate%
      1.04           +10.9%       1.15 ±  2%  perf-stat.overall.cpi
    291.68           -15.5%     246.52 ±  4%  perf-stat.overall.cycles-between-cache-misses
      0.96            -9.8%       0.87 ±  2%  perf-stat.overall.ipc
 1.795e+09           -12.0%   1.58e+09        perf-stat.ps.branch-instructions
  65831109            -2.5%   64181687        perf-stat.ps.branch-misses
  32991347           +16.6%   38460099 ±  7%  perf-stat.ps.cache-misses
      1905 ±  2%      +9.0%       2077 ±  4%  perf-stat.ps.context-switches
  9.28e+09           -11.5%  8.215e+09 ±  2%  perf-stat.ps.instructions
      6695 ±  2%     +13.6%       7604 ±  5%  perf-stat.ps.minor-faults
      6696 ±  2%     +13.6%       7604 ±  5%  perf-stat.ps.page-faults
 1.918e+11 ±  2%     -25.3%  1.432e+11 ±  7%  perf-stat.total.instructions
     24.59 ±  5%      -7.7       16.86        perf-profile.calltrace.cycles-pp.ext4_buffered_write_iter.vfs_write.ksys_write.do_syscall_64.entry_SYSCALL_64_after_hwframe
     24.39 ±  5%      -7.7       16.68        perf-profile.calltrace.cycles-pp.generic_perform_write.ext4_buffered_write_iter.vfs_write.ksys_write.do_syscall_64
     30.60            -6.3       24.26        perf-profile.calltrace.cycles-pp.write
     29.63            -6.3       23.36        perf-profile.calltrace.cycles-pp.entry_SYSCALL_64_after_hwframe.write
     29.39            -6.3       23.12        perf-profile.calltrace.cycles-pp.vfs_write.ksys_write.do_syscall_64.entry_SYSCALL_64_after_hwframe.write
     29.60            -6.3       23.34        perf-profile.calltrace.cycles-pp.do_syscall_64.entry_SYSCALL_64_after_hwframe.write
     29.50            -6.3       23.24        perf-profile.calltrace.cycles-pp.ksys_write.do_syscall_64.entry_SYSCALL_64_after_hwframe.write
      6.31 ±  3%      -5.1        1.24 ±  6%  perf-profile.calltrace.cycles-pp.ext4_da_do_write_end.generic_perform_write.ext4_buffered_write_iter.vfs_write.ksys_write
      5.84 ±  3%      -4.7        1.17 ±  6%  perf-profile.calltrace.cycles-pp.block_write_end.ext4_da_do_write_end.generic_perform_write.ext4_buffered_write_iter.vfs_write
      5.79 ±  3%      -4.6        1.16 ±  6%  perf-profile.calltrace.cycles-pp.__block_commit_write.block_write_end.ext4_da_do_write_end.generic_perform_write.ext4_buffered_write_iter
      9.66 ±  5%      -4.0        5.62 ±  2%  perf-profile.calltrace.cycles-pp.ext4_da_write_begin.generic_perform_write.ext4_buffered_write_iter.vfs_write.ksys_write
     17.24 ±  5%      -3.7       13.49        perf-profile.calltrace.cycles-pp.__writeback_single_inode.writeback_sb_inodes.__writeback_inodes_wb.wb_writeback.wb_do_writeback
     17.24 ±  5%      -3.7       13.49        perf-profile.calltrace.cycles-pp.do_writepages.__writeback_single_inode.writeback_sb_inodes.__writeback_inodes_wb.wb_writeback
     17.24 ±  5%      -3.7       13.50        perf-profile.calltrace.cycles-pp.__writeback_inodes_wb.wb_writeback.wb_do_writeback.wb_workfn.process_one_work
     17.24 ±  5%      -3.7       13.50        perf-profile.calltrace.cycles-pp.writeback_sb_inodes.__writeback_inodes_wb.wb_writeback.wb_do_writeback.wb_workfn
     17.24 ±  5%      -3.7       13.49        perf-profile.calltrace.cycles-pp.ext4_writepages.do_writepages.__writeback_single_inode.writeback_sb_inodes.__writeback_inodes_wb
      4.92 ±  6%      -3.1        1.87 ±  2%  perf-profile.calltrace.cycles-pp.__filemap_get_folio.ext4_da_write_begin.generic_perform_write.ext4_buffered_write_iter.vfs_write
      2.60 ±  5%      -1.7        0.94 ±  4%  perf-profile.calltrace.cycles-pp.filemap_add_folio.__filemap_get_folio.ext4_da_write_begin.generic_perform_write.ext4_buffered_write_iter
      1.78 ±  8%      -1.1        0.68 ±  7%  perf-profile.calltrace.cycles-pp.alloc_pages_mpol.folio_alloc_noprof.__filemap_get_folio.ext4_da_write_begin.generic_perform_write
      1.56 ±  8%      -1.1        0.48 ± 44%  perf-profile.calltrace.cycles-pp.mpage_submit_folio.mpage_map_and_submit_buffers.mpage_map_and_submit_extent.ext4_do_writepages.ext4_writepages
      1.86 ±  7%      -1.0        0.82 ±  6%  perf-profile.calltrace.cycles-pp.folio_alloc_noprof.__filemap_get_folio.ext4_da_write_begin.generic_perform_write.ext4_buffered_write_iter
      1.65 ±  7%      -1.0        0.64 ±  7%  perf-profile.calltrace.cycles-pp.__alloc_frozen_pages_noprof.alloc_pages_mpol.folio_alloc_noprof.__filemap_get_folio.ext4_da_write_begin
      4.54 ±  5%      -0.9        3.67 ±  3%  perf-profile.calltrace.cycles-pp.ext4_block_write_begin.ext4_da_write_begin.generic_perform_write.ext4_buffered_write_iter.vfs_write
      1.93 ±  6%      -0.8        1.08 ± 32%  perf-profile.calltrace.cycles-pp.ext4_finish_bio.ext4_release_io_end.ext4_end_io_end.ext4_do_writepages.ext4_writepages
      1.93 ±  6%      -0.8        1.09 ± 31%  perf-profile.calltrace.cycles-pp.ext4_release_io_end.ext4_end_io_end.ext4_do_writepages.ext4_writepages.do_writepages
      1.42 ±  7%      -0.8        0.58 ±  9%  perf-profile.calltrace.cycles-pp.get_page_from_freelist.__alloc_frozen_pages_noprof.alloc_pages_mpol.folio_alloc_noprof.__filemap_get_folio
      1.96 ±  6%      -0.8        1.20 ± 26%  perf-profile.calltrace.cycles-pp.ext4_end_io_end.ext4_do_writepages.ext4_writepages.do_writepages.__writeback_single_inode
      1.31 ±  3%      -0.7        0.58 ±  6%  perf-profile.calltrace.cycles-pp.mark_buffer_dirty.__block_commit_write.block_write_end.ext4_da_do_write_end.generic_perform_write
      2.04 ±  9%      -0.7        1.35 ±  7%  perf-profile.calltrace.cycles-pp.mpage_map_and_submit_buffers.mpage_map_and_submit_extent.ext4_do_writepages.ext4_writepages.do_writepages
      2.11 ±  9%      -0.6        1.46 ±  7%  perf-profile.calltrace.cycles-pp.mpage_map_and_submit_extent.ext4_do_writepages.ext4_writepages.do_writepages.__writeback_single_inode
      1.84 ±  7%      -0.5        1.30 ±  7%  perf-profile.calltrace.cycles-pp.create_empty_buffers.ext4_block_write_begin.ext4_da_write_begin.generic_perform_write.ext4_buffered_write_iter
      1.38 ±  8%      -0.2        1.18 ±  7%  perf-profile.calltrace.cycles-pp.folio_alloc_buffers.create_empty_buffers.ext4_block_write_begin.ext4_da_write_begin.generic_perform_write
      0.62 ±  2%      -0.1        0.56 ±  5%  perf-profile.calltrace.cycles-pp.ext4_es_lookup_extent.ext4_da_map_blocks.ext4_da_get_block_prep.ext4_block_write_begin.ext4_da_write_begin
      0.40 ± 70%      +0.3        0.67 ±  5%  perf-profile.calltrace.cycles-pp.io_serial_out.serial8250_console_write.console_flush_all.console_unlock.vprintk_emit
      1.34 ±  9%      +0.6        1.96 ±  4%  perf-profile.calltrace.cycles-pp.clear_page_erms.get_page_from_freelist.__alloc_frozen_pages_noprof.alloc_pages_mpol.alloc_pages_noprof
      0.34 ±103%      +0.8        1.13 ± 25%  perf-profile.calltrace.cycles-pp.get_jiffies_update.tmigr_requires_handle_remote.update_process_times.tick_nohz_handler.__hrtimer_run_queues
      1.84 ± 12%      +0.8        2.62 ± 10%  perf-profile.calltrace.cycles-pp.update_process_times.tick_nohz_handler.__hrtimer_run_queues.hrtimer_interrupt.__sysvec_apic_timer_interrupt
      0.35 ±103%      +0.8        1.15 ± 24%  perf-profile.calltrace.cycles-pp.tmigr_requires_handle_remote.update_process_times.tick_nohz_handler.__hrtimer_run_queues.hrtimer_interrupt
      4.14 ±  8%      +0.9        5.02 ±  9%  perf-profile.calltrace.cycles-pp.__sysvec_apic_timer_interrupt.sysvec_apic_timer_interrupt.asm_sysvec_apic_timer_interrupt.cpuidle_enter_state.cpuidle_enter
      4.11 ±  8%      +0.9        5.00 ±  9%  perf-profile.calltrace.cycles-pp.hrtimer_interrupt.__sysvec_apic_timer_interrupt.sysvec_apic_timer_interrupt.asm_sysvec_apic_timer_interrupt.cpuidle_enter_state
      1.95 ±  6%      +0.9        2.89 ±  3%  perf-profile.calltrace.cycles-pp.get_page_from_freelist.__alloc_frozen_pages_noprof.alloc_pages_mpol.alloc_pages_noprof.brd_insert_page
      2.35 ± 10%      +1.0        3.32 ± 12%  perf-profile.calltrace.cycles-pp.tick_nohz_handler.__hrtimer_run_queues.hrtimer_interrupt.__sysvec_apic_timer_interrupt.sysvec_apic_timer_interrupt
      3.21 ±  9%      +1.0        4.21 ± 10%  perf-profile.calltrace.cycles-pp.__hrtimer_run_queues.hrtimer_interrupt.__sysvec_apic_timer_interrupt.sysvec_apic_timer_interrupt.asm_sysvec_apic_timer_interrupt
      2.10 ±  6%      +1.0        3.12 ±  2%  perf-profile.calltrace.cycles-pp.__alloc_frozen_pages_noprof.alloc_pages_mpol.alloc_pages_noprof.brd_insert_page.brd_submit_bio
      2.21 ±  5%      +1.1        3.30 ±  3%  perf-profile.calltrace.cycles-pp.alloc_pages_mpol.alloc_pages_noprof.brd_insert_page.brd_submit_bio.__submit_bio
      2.26 ±  5%      +1.1        3.37 ±  3%  perf-profile.calltrace.cycles-pp.alloc_pages_noprof.brd_insert_page.brd_submit_bio.__submit_bio.__submit_bio_noacct
      3.70 ± 30%      +1.2        4.90 ±  4%  perf-profile.calltrace.cycles-pp.wait_for_lsr.serial8250_console_write.console_flush_all.console_unlock.vprintk_emit
      4.40 ± 29%      +1.4        5.82 ±  4%  perf-profile.calltrace.cycles-pp.serial8250_console_write.console_flush_all.console_unlock.vprintk_emit.devkmsg_emit
      4.96 ± 32%      +1.4        6.41        perf-profile.calltrace.cycles-pp.memcpy_toio.drm_fb_memcpy.ast_primary_plane_helper_atomic_update.drm_atomic_helper_commit_planes.drm_atomic_helper_commit_tail
      4.68 ± 30%      +1.5        6.17 ±  4%  perf-profile.calltrace.cycles-pp.devkmsg_write.vfs_write.ksys_write.do_syscall_64.entry_SYSCALL_64_after_hwframe
      4.68 ± 30%      +1.5        6.17 ±  4%  perf-profile.calltrace.cycles-pp.devkmsg_emit.devkmsg_write.vfs_write.ksys_write.do_syscall_64
      4.68 ± 30%      +1.5        6.17 ±  4%  perf-profile.calltrace.cycles-pp.vprintk_emit.devkmsg_emit.devkmsg_write.vfs_write.ksys_write
      4.68 ± 30%      +1.5        6.17 ±  4%  perf-profile.calltrace.cycles-pp.console_unlock.vprintk_emit.devkmsg_emit.devkmsg_write.vfs_write
      4.68 ± 30%      +1.5        6.17 ±  4%  perf-profile.calltrace.cycles-pp.console_flush_all.console_unlock.vprintk_emit.devkmsg_emit.devkmsg_write
      5.04 ± 32%      +1.5        6.54        perf-profile.calltrace.cycles-pp.drm_fb_memcpy.ast_primary_plane_helper_atomic_update.drm_atomic_helper_commit_planes.drm_atomic_helper_commit_tail.ast_mode_config_helper_atomic_commit_tail
      5.06 ± 32%      +1.5        6.59        perf-profile.calltrace.cycles-pp.ast_primary_plane_helper_atomic_update.drm_atomic_helper_commit_planes.drm_atomic_helper_commit_tail.ast_mode_config_helper_atomic_commit_tail.commit_tail
      5.07 ± 32%      +1.5        6.60        perf-profile.calltrace.cycles-pp.drm_atomic_commit.drm_atomic_helper_dirtyfb.drm_fbdev_shmem_helper_fb_dirty.drm_fb_helper_damage_work.process_one_work
      5.07 ± 32%      +1.5        6.59        perf-profile.calltrace.cycles-pp.drm_atomic_helper_commit_planes.drm_atomic_helper_commit_tail.ast_mode_config_helper_atomic_commit_tail.commit_tail.drm_atomic_helper_commit
      5.07 ± 32%      +1.5        6.60        perf-profile.calltrace.cycles-pp.drm_atomic_helper_commit_tail.ast_mode_config_helper_atomic_commit_tail.commit_tail.drm_atomic_helper_commit.drm_atomic_commit
      5.07 ± 32%      +1.5        6.60        perf-profile.calltrace.cycles-pp.ast_mode_config_helper_atomic_commit_tail.commit_tail.drm_atomic_helper_commit.drm_atomic_commit.drm_atomic_helper_dirtyfb
      5.07 ± 32%      +1.5        6.60        perf-profile.calltrace.cycles-pp.commit_tail.drm_atomic_helper_commit.drm_atomic_commit.drm_atomic_helper_dirtyfb.drm_fbdev_shmem_helper_fb_dirty
      5.07 ± 32%      +1.5        6.60        perf-profile.calltrace.cycles-pp.drm_atomic_helper_commit.drm_atomic_commit.drm_atomic_helper_dirtyfb.drm_fbdev_shmem_helper_fb_dirty.drm_fb_helper_damage_work
      5.07 ± 32%      +1.5        6.60        perf-profile.calltrace.cycles-pp.drm_atomic_helper_dirtyfb.drm_fbdev_shmem_helper_fb_dirty.drm_fb_helper_damage_work.process_one_work.worker_thread
      5.07 ± 32%      +1.5        6.60        perf-profile.calltrace.cycles-pp.drm_fb_helper_damage_work.process_one_work.worker_thread.kthread.ret_from_fork
      5.07 ± 32%      +1.5        6.60        perf-profile.calltrace.cycles-pp.drm_fbdev_shmem_helper_fb_dirty.drm_fb_helper_damage_work.process_one_work.worker_thread.kthread
      7.23 ±  9%      +1.9        9.10 ±  2%  perf-profile.calltrace.cycles-pp.rep_movs_alternative.copy_page_from_iter_atomic.generic_perform_write.ext4_buffered_write_iter.vfs_write
      7.48 ±  9%      +1.9        9.39 ±  2%  perf-profile.calltrace.cycles-pp.copy_page_from_iter_atomic.generic_perform_write.ext4_buffered_write_iter.vfs_write.ksys_write
      3.78 ±  7%      +2.2        5.96 ±  7%  perf-profile.calltrace.cycles-pp.memcpy_orig.copy_to_brd.brd_submit_bio.__submit_bio.__submit_bio_noacct
      4.02 ±  7%      +2.3        6.28 ±  6%  perf-profile.calltrace.cycles-pp.copy_to_brd.brd_submit_bio.__submit_bio.__submit_bio_noacct.ext4_io_submit
      5.63 ±  8%      +3.2        8.81 ±  4%  perf-profile.calltrace.cycles-pp._raw_spin_lock.brd_insert_page.brd_submit_bio.__submit_bio.__submit_bio_noacct
      8.48 ±  7%      +4.4       12.91 ±  3%  perf-profile.calltrace.cycles-pp.brd_insert_page.brd_submit_bio.__submit_bio.__submit_bio_noacct.ext4_io_submit
     17.80 ±  4%      +5.1       22.94        perf-profile.calltrace.cycles-pp.wb_do_writeback.wb_workfn.process_one_work.worker_thread.kthread
     17.80 ±  4%      +5.1       22.94        perf-profile.calltrace.cycles-pp.wb_workfn.process_one_work.worker_thread.kthread.ret_from_fork
     17.80 ±  4%      +5.1       22.94        perf-profile.calltrace.cycles-pp.wb_writeback.wb_do_writeback.wb_workfn.process_one_work.worker_thread
     17.72 ±  4%      +5.2       22.93        perf-profile.calltrace.cycles-pp.ext4_do_writepages.ext4_writepages.do_writepages.__writeback_single_inode.writeback_sb_inodes
     12.77 ±  5%      +6.6       19.36        perf-profile.calltrace.cycles-pp.brd_submit_bio.__submit_bio.__submit_bio_noacct.ext4_io_submit.ext4_do_writepages
     12.77 ±  5%      +6.6       19.37        perf-profile.calltrace.cycles-pp.__submit_bio.__submit_bio_noacct.ext4_io_submit.ext4_do_writepages.ext4_writepages
     12.77 ±  5%      +6.6       19.37        perf-profile.calltrace.cycles-pp.__submit_bio_noacct.ext4_io_submit.ext4_do_writepages.ext4_writepages.do_writepages
     12.77 ±  5%      +6.6       19.37        perf-profile.calltrace.cycles-pp.ext4_io_submit.ext4_do_writepages.ext4_writepages.do_writepages.__writeback_single_inode
     23.05 ±  4%      +6.7       29.71        perf-profile.calltrace.cycles-pp.kthread.ret_from_fork.ret_from_fork_asm
     22.96 ±  4%      +6.7       29.61        perf-profile.calltrace.cycles-pp.process_one_work.worker_thread.kthread.ret_from_fork.ret_from_fork_asm
     23.05 ±  4%      +6.7       29.71        perf-profile.calltrace.cycles-pp.ret_from_fork.ret_from_fork_asm
     23.05 ±  4%      +6.7       29.71        perf-profile.calltrace.cycles-pp.ret_from_fork_asm
     22.98 ±  4%      +6.7       29.64        perf-profile.calltrace.cycles-pp.worker_thread.kthread.ret_from_fork.ret_from_fork_asm
      0.48 ±110%      +9.0        9.44 ±  2%  perf-profile.calltrace.cycles-pp.ext4_writepages.do_writepages.__writeback_single_inode.writeback_sb_inodes.wb_writeback
      0.48 ±110%      +9.0        9.44 ±  2%  perf-profile.calltrace.cycles-pp.__writeback_single_inode.writeback_sb_inodes.wb_writeback.wb_do_writeback.wb_workfn
      0.48 ±110%      +9.0        9.44 ±  2%  perf-profile.calltrace.cycles-pp.do_writepages.__writeback_single_inode.writeback_sb_inodes.wb_writeback.wb_do_writeback
      0.48 ±110%      +9.0        9.44 ±  2%  perf-profile.calltrace.cycles-pp.writeback_sb_inodes.wb_writeback.wb_do_writeback.wb_workfn.process_one_work
     24.59 ±  5%      -7.7       16.86        perf-profile.children.cycles-pp.ext4_buffered_write_iter
     24.41 ±  5%      -7.7       16.69        perf-profile.children.cycles-pp.generic_perform_write
     31.13            -6.6       24.56 ±  2%  perf-profile.children.cycles-pp.entry_SYSCALL_64_after_hwframe
     31.10            -6.6       24.53 ±  2%  perf-profile.children.cycles-pp.do_syscall_64
     30.67            -6.4       24.32        perf-profile.children.cycles-pp.write
     29.40            -6.3       23.14        perf-profile.children.cycles-pp.vfs_write
     29.51            -6.3       23.26        perf-profile.children.cycles-pp.ksys_write
      6.32 ±  3%      -5.1        1.24 ±  6%  perf-profile.children.cycles-pp.ext4_da_do_write_end
      5.86 ±  3%      -4.7        1.17 ±  6%  perf-profile.children.cycles-pp.block_write_end
      5.80 ±  3%      -4.6        1.16 ±  6%  perf-profile.children.cycles-pp.__block_commit_write
      9.67 ±  5%      -4.0        5.62 ±  2%  perf-profile.children.cycles-pp.ext4_da_write_begin
     17.24 ±  5%      -3.7       13.50        perf-profile.children.cycles-pp.__writeback_inodes_wb
      4.94 ±  6%      -3.1        1.88 ±  2%  perf-profile.children.cycles-pp.__filemap_get_folio
      2.62 ±  5%      -1.7        0.95 ±  4%  perf-profile.children.cycles-pp.filemap_add_folio
      1.86 ±  7%      -1.0        0.82 ±  6%  perf-profile.children.cycles-pp.folio_alloc_noprof
      4.54 ±  5%      -0.9        3.68 ±  3%  perf-profile.children.cycles-pp.ext4_block_write_begin
      2.11 ±  8%      -0.8        1.35 ±  7%  perf-profile.children.cycles-pp.mpage_map_and_submit_buffers
      1.15 ±  6%      -0.7        0.41 ± 11%  perf-profile.children.cycles-pp.__folio_batch_add_and_move
      1.31 ±  3%      -0.7        0.58 ±  6%  perf-profile.children.cycles-pp.mark_buffer_dirty
      1.34 ±  7%      -0.7        0.61 ±  9%  perf-profile.children.cycles-pp.folio_end_writeback
      2.17 ±  8%      -0.7        1.46 ±  7%  perf-profile.children.cycles-pp.mpage_map_and_submit_extent
      1.02 ±  3%      -0.7        0.33 ± 10%  perf-profile.children.cycles-pp.__folio_mark_dirty
      2.00 ±  5%      -0.7        1.32 ± 10%  perf-profile.children.cycles-pp.ext4_finish_bio
      2.00 ±  5%      -0.7        1.32 ± 10%  perf-profile.children.cycles-pp.ext4_release_io_end
      2.03 ±  6%      -0.7        1.36 ±  9%  perf-profile.children.cycles-pp.ext4_end_io_end
      1.61 ±  7%      -0.7        0.94 ±  8%  perf-profile.children.cycles-pp.mpage_submit_folio
      1.04 ±  5%      -0.6        0.39 ± 11%  perf-profile.children.cycles-pp.folio_batch_move_lru
      1.07 ±  8%      -0.6        0.44 ±  6%  perf-profile.children.cycles-pp.__filemap_add_folio
      0.80 ±  5%      -0.6        0.21 ±  7%  perf-profile.children.cycles-pp.lru_add
      1.11 ±  6%      -0.6        0.52 ±  9%  perf-profile.children.cycles-pp.__folio_end_writeback
      1.84 ±  7%      -0.5        1.30 ±  7%  perf-profile.children.cycles-pp.create_empty_buffers
      0.99 ±  5%      -0.5        0.45 ±  5%  perf-profile.children.cycles-pp.__lruvec_stat_mod_folio
      0.68 ±  8%      -0.4        0.27 ± 11%  perf-profile.children.cycles-pp.__folio_start_writeback
      0.66 ±  7%      -0.4        0.27 ±  8%  perf-profile.children.cycles-pp.__mod_memcg_lruvec_state
      0.53 ±  7%      -0.4        0.14 ± 11%  perf-profile.children.cycles-pp.lru_gen_add_folio
      1.50 ±  7%      -0.4        1.14 ±  7%  perf-profile.children.cycles-pp.rmqueue
      1.09 ±  7%      -0.3        0.75 ±  9%  perf-profile.children.cycles-pp.ext4_bio_write_folio
      0.56 ±  5%      -0.3        0.23 ± 11%  perf-profile.children.cycles-pp.folio_account_dirtied
      0.48 ±  7%      -0.3        0.18 ±  7%  perf-profile.children.cycles-pp.folio_clear_dirty_for_io
      1.12 ±  6%      -0.3        0.82 ±  9%  perf-profile.children.cycles-pp.__rmqueue_pcplist
      0.36 ± 10%      -0.3        0.09 ±  4%  perf-profile.children.cycles-pp.fault_in_iov_iter_readable
      0.33 ± 10%      -0.2        0.08 ±  8%  perf-profile.children.cycles-pp.fault_in_readable
      0.33 ±  6%      -0.2        0.09 ± 14%  perf-profile.children.cycles-pp.__mem_cgroup_charge
      0.28 ±  8%      -0.2        0.07 ± 17%  perf-profile.children.cycles-pp.filemap_get_entry
      0.27 ±  5%      -0.2        0.06 ± 19%  perf-profile.children.cycles-pp.__xa_set_mark
      1.40 ±  8%      -0.2        1.19 ±  7%  perf-profile.children.cycles-pp.folio_alloc_buffers
      0.28 ±  9%      -0.2        0.08 ±  8%  perf-profile.children.cycles-pp.node_dirty_ok
      0.47 ± 10%      -0.2        0.28 ± 13%  perf-profile.children.cycles-pp.percpu_counter_add_batch
      1.00 ±  8%      -0.2        0.83 ±  3%  perf-profile.children.cycles-pp.xas_load
      0.21 ± 11%      -0.1        0.08        perf-profile.children.cycles-pp.__mod_node_page_state
      0.76 ±  8%      -0.1        0.64 ±  8%  perf-profile.children.cycles-pp.rmqueue_bulk
      0.20 ±  9%      -0.1        0.09 ± 18%  perf-profile.children.cycles-pp._raw_spin_lock_irq
      0.16 ± 11%      -0.1        0.06 ± 48%  perf-profile.children.cycles-pp.__mark_inode_dirty
      0.12 ± 16%      -0.1        0.02 ± 99%  perf-profile.children.cycles-pp.xas_find_conflict
      0.25 ± 11%      -0.1        0.16 ± 10%  perf-profile.children.cycles-pp.balance_dirty_pages_ratelimited_flags
      0.12 ± 17%      -0.1        0.02 ± 99%  perf-profile.children.cycles-pp.mod_zone_page_state
      0.14 ±  9%      -0.1        0.06 ± 15%  perf-profile.children.cycles-pp.charge_memcg
      0.15 ±  9%      -0.1        0.06 ± 11%  perf-profile.children.cycles-pp.cgroup_rstat_updated
      0.16 ± 12%      -0.1        0.08 ± 12%  perf-profile.children.cycles-pp.ext4_da_write_end
      0.22 ± 13%      -0.1        0.14 ± 10%  perf-profile.children.cycles-pp.xas_start
      0.35 ± 10%      -0.1        0.28 ±  8%  perf-profile.children.cycles-pp.allocate_slab
      0.12 ± 10%      -0.1        0.05 ± 47%  perf-profile.children.cycles-pp.try_charge_memcg
      0.12 ± 11%      -0.1        0.05 ±  7%  perf-profile.children.cycles-pp.__mod_zone_page_state
      0.12 ± 18%      -0.1        0.06 ± 18%  perf-profile.children.cycles-pp.__fprop_add_percpu
      0.57 ±  9%      -0.1        0.51 ±  6%  perf-profile.children.cycles-pp.__memcg_slab_post_alloc_hook
      0.63 ±  2%      -0.1        0.58 ±  4%  perf-profile.children.cycles-pp.ext4_es_lookup_extent
      0.12 ± 15%      -0.1        0.07 ± 16%  perf-profile.children.cycles-pp._raw_spin_unlock_irqrestore
      0.08 ± 20%      -0.1        0.03 ±100%  perf-profile.children.cycles-pp.xas_find_marked
      0.20 ± 12%      -0.0        0.16 ±  9%  perf-profile.children.cycles-pp.__cond_resched
      0.12 ± 10%      -0.0        0.08 ± 11%  perf-profile.children.cycles-pp.policy_nodemask
      0.14 ±  5%      -0.0        0.11 ±  8%  perf-profile.children.cycles-pp.up_write
      0.08 ±  8%      -0.0        0.06 ± 15%  perf-profile.children.cycles-pp.rcu_all_qs
      0.10 ± 13%      +0.0        0.12 ±  6%  perf-profile.children.cycles-pp.vfs_read
      0.10 ± 15%      +0.0        0.13 ±  8%  perf-profile.children.cycles-pp.ksys_read
      0.07 ± 15%      +0.0        0.11 ± 24%  perf-profile.children.cycles-pp.ext4_ext_map_blocks
      0.07 ± 14%      +0.1        0.12 ± 18%  perf-profile.children.cycles-pp.ext4_map_create_blocks
      0.08 ±  8%      +0.1        0.14 ± 19%  perf-profile.children.cycles-pp.ext4_map_blocks
      0.01 ±223%      +0.1        0.07 ± 33%  perf-profile.children.cycles-pp.ext4_mb_new_blocks
      0.30 ± 10%      +0.1        0.43 ±  8%  perf-profile.children.cycles-pp.__xa_insert
      0.37 ± 12%      +0.1        0.51 ±  9%  perf-profile.children.cycles-pp.xa_load
      0.52 ± 29%      +0.2        0.68 ±  5%  perf-profile.children.cycles-pp.io_serial_out
      1.40 ± 10%      +0.6        1.98 ±  4%  perf-profile.children.cycles-pp.clear_page_erms
      0.56 ± 31%      +0.6        1.17 ± 24%  perf-profile.children.cycles-pp.tmigr_requires_handle_remote
      0.54 ± 34%      +0.6        1.15 ± 24%  perf-profile.children.cycles-pp.get_jiffies_update
      1.98 ± 12%      +0.8        2.80 ± 10%  perf-profile.children.cycles-pp.update_process_times
      4.39 ±  8%      +0.9        5.31 ±  9%  perf-profile.children.cycles-pp.__sysvec_apic_timer_interrupt
      4.36 ±  8%      +0.9        5.29 ±  9%  perf-profile.children.cycles-pp.hrtimer_interrupt
      2.53 ± 10%      +1.0        3.53 ± 12%  perf-profile.children.cycles-pp.tick_nohz_handler
      3.44 ±  8%      +1.0        4.47 ± 10%  perf-profile.children.cycles-pp.__hrtimer_run_queues
      2.35 ±  5%      +1.0        3.39 ±  3%  perf-profile.children.cycles-pp.alloc_pages_noprof
      4.03 ± 29%      +1.3        5.28 ±  5%  perf-profile.children.cycles-pp.wait_for_lsr
      4.55 ± 29%      +1.4        5.97 ±  5%  perf-profile.children.cycles-pp.serial8250_console_write
      4.84 ± 29%      +1.5        6.32 ±  4%  perf-profile.children.cycles-pp.console_flush_all
      4.84 ± 29%      +1.5        6.32 ±  4%  perf-profile.children.cycles-pp.console_unlock
      4.85 ± 29%      +1.5        6.32 ±  4%  perf-profile.children.cycles-pp.vprintk_emit
      4.68 ± 30%      +1.5        6.17 ±  4%  perf-profile.children.cycles-pp.devkmsg_write
      4.68 ± 30%      +1.5        6.17 ±  4%  perf-profile.children.cycles-pp.devkmsg_emit
      5.02 ± 32%      +1.5        6.52        perf-profile.children.cycles-pp.memcpy_toio
      5.06 ± 32%      +1.5        6.59        perf-profile.children.cycles-pp.drm_fb_memcpy
      5.06 ± 32%      +1.5        6.59        perf-profile.children.cycles-pp.ast_primary_plane_helper_atomic_update
      5.07 ± 32%      +1.5        6.60        perf-profile.children.cycles-pp.drm_atomic_commit
      5.07 ± 32%      +1.5        6.59        perf-profile.children.cycles-pp.drm_atomic_helper_commit_planes
      5.07 ± 32%      +1.5        6.60        perf-profile.children.cycles-pp.drm_atomic_helper_commit_tail
      5.07 ± 32%      +1.5        6.60        perf-profile.children.cycles-pp.ast_mode_config_helper_atomic_commit_tail
      5.07 ± 32%      +1.5        6.60        perf-profile.children.cycles-pp.commit_tail
      5.07 ± 32%      +1.5        6.60        perf-profile.children.cycles-pp.drm_atomic_helper_commit
      5.07 ± 32%      +1.5        6.60        perf-profile.children.cycles-pp.drm_atomic_helper_dirtyfb
      5.07 ± 32%      +1.5        6.60        perf-profile.children.cycles-pp.drm_fb_helper_damage_work
      5.07 ± 32%      +1.5        6.60        perf-profile.children.cycles-pp.drm_fbdev_shmem_helper_fb_dirty
      7.25 ±  9%      +1.9        9.13 ±  2%  perf-profile.children.cycles-pp.rep_movs_alternative
      7.50 ±  9%      +1.9        9.40 ±  2%  perf-profile.children.cycles-pp.copy_page_from_iter_atomic
      3.96 ±  7%      +2.0        6.00 ±  7%  perf-profile.children.cycles-pp.memcpy_orig
      4.18 ±  7%      +2.1        6.30 ±  6%  perf-profile.children.cycles-pp.copy_to_brd
      6.53 ±  7%      +2.8        9.36 ±  3%  perf-profile.children.cycles-pp._raw_spin_lock
      8.66 ±  6%      +4.3       12.93 ±  3%  perf-profile.children.cycles-pp.brd_insert_page
     17.79 ±  4%      +5.1       22.93        perf-profile.children.cycles-pp.__writeback_single_inode
     17.79 ±  4%      +5.1       22.93        perf-profile.children.cycles-pp.ext4_writepages
     17.79 ±  4%      +5.1       22.93        perf-profile.children.cycles-pp.do_writepages
     17.79 ±  4%      +5.1       22.93        perf-profile.children.cycles-pp.ext4_do_writepages
     17.80 ±  4%      +5.1       22.94        perf-profile.children.cycles-pp.writeback_sb_inodes
     17.80 ±  4%      +5.1       22.94        perf-profile.children.cycles-pp.wb_do_writeback
     17.80 ±  4%      +5.1       22.94        perf-profile.children.cycles-pp.wb_workfn
     17.80 ±  4%      +5.1       22.94        perf-profile.children.cycles-pp.wb_writeback
     12.92 ±  4%      +6.4       19.36        perf-profile.children.cycles-pp.brd_submit_bio
     12.92 ±  4%      +6.4       19.37        perf-profile.children.cycles-pp.__submit_bio
     12.92 ±  4%      +6.4       19.37        perf-profile.children.cycles-pp.__submit_bio_noacct
     12.89 ±  4%      +6.5       19.37        perf-profile.children.cycles-pp.ext4_io_submit
     23.05 ±  4%      +6.7       29.71        perf-profile.children.cycles-pp.kthread
     22.96 ±  4%      +6.7       29.61        perf-profile.children.cycles-pp.process_one_work
     23.06 ±  4%      +6.7       29.71        perf-profile.children.cycles-pp.ret_from_fork_asm
     23.05 ±  4%      +6.7       29.71        perf-profile.children.cycles-pp.ret_from_fork
     22.98 ±  4%      +6.7       29.64        perf-profile.children.cycles-pp.worker_thread
      4.33 ±  4%      -3.8        0.53 ±  8%  perf-profile.self.cycles-pp.__block_commit_write
      0.96 ±  6%      -0.4        0.57 ±  8%  perf-profile.self.cycles-pp._raw_spin_lock_irqsave
      0.52 ±  8%      -0.3        0.21 ± 10%  perf-profile.self.cycles-pp.__mod_memcg_lruvec_state
      0.37 ±  6%      -0.3        0.10 ±  8%  perf-profile.self.cycles-pp.lru_gen_add_folio
      0.34 ±  6%      -0.3        0.07 ±  5%  perf-profile.self.cycles-pp.__filemap_add_folio
      0.32 ±  8%      -0.3        0.06 ± 45%  perf-profile.self.cycles-pp.ext4_da_do_write_end
      0.33 ±  9%      -0.2        0.08 ± 10%  perf-profile.self.cycles-pp.fault_in_readable
      0.33 ± 11%      -0.2        0.12 ± 10%  perf-profile.self.cycles-pp.__folio_end_writeback
      0.26 ±  8%      -0.2        0.07 ± 20%  perf-profile.self.cycles-pp.lru_add
      0.36 ±  8%      -0.2        0.18 ± 16%  perf-profile.self.cycles-pp.__rmqueue_pcplist
      0.37 ± 10%      -0.2        0.19 ± 18%  perf-profile.self.cycles-pp.percpu_counter_add_batch
      0.24 ±  7%      -0.2        0.07 ± 14%  perf-profile.self.cycles-pp.create_empty_buffers
      0.69 ±  7%      -0.2        0.53 ±  6%  perf-profile.self.cycles-pp.rmqueue_bulk
      0.24 ±  9%      -0.2        0.08 ±  8%  perf-profile.self.cycles-pp.__folio_start_writeback
      0.27 ±  7%      -0.1        0.12 ±  8%  perf-profile.self.cycles-pp.ext4_block_write_begin
      0.22 ±  9%      -0.1        0.08 ± 12%  perf-profile.self.cycles-pp.folio_clear_dirty_for_io
      0.26 ±  9%      -0.1        0.12 ± 13%  perf-profile.self.cycles-pp.folios_put_refs
      0.22 ± 12%      -0.1        0.08 ±  8%  perf-profile.self.cycles-pp.folio_end_writeback
      0.29 ±  7%      -0.1        0.15 ±  5%  perf-profile.self.cycles-pp.__lruvec_stat_mod_folio
      0.19 ± 11%      -0.1        0.06 ±  9%  perf-profile.self.cycles-pp.node_dirty_ok
      0.16 ±  4%      -0.1        0.05 ± 49%  perf-profile.self.cycles-pp.ext4_da_write_begin
      0.18 ± 11%      -0.1        0.07 ±  5%  perf-profile.self.cycles-pp.__mod_node_page_state
      0.20 ±  7%      -0.1        0.08 ± 17%  perf-profile.self.cycles-pp._raw_spin_lock_irq
      0.16 ± 13%      -0.1        0.07 ± 12%  perf-profile.self.cycles-pp.ext4_da_write_end
      0.32 ± 12%      -0.1        0.23 ± 15%  perf-profile.self.cycles-pp.get_page_from_freelist
      0.11 ±  8%      -0.1        0.02 ± 99%  perf-profile.self.cycles-pp.__mod_zone_page_state
      0.14 ±  8%      -0.1        0.06 ± 19%  perf-profile.self.cycles-pp.mpage_prepare_extent_to_map
      0.14 ±  9%      -0.1        0.06 ± 11%  perf-profile.self.cycles-pp.cgroup_rstat_updated
      0.21 ± 13%      -0.1        0.13 ± 10%  perf-profile.self.cycles-pp.xas_start
      0.14 ± 37%      -0.1        0.06 ± 21%  perf-profile.self.cycles-pp.folio_alloc_buffers
      0.34 ±  9%      -0.1        0.27 ± 11%  perf-profile.self.cycles-pp.__alloc_frozen_pages_noprof
      0.16 ± 19%      -0.1        0.09 ± 12%  perf-profile.self.cycles-pp.generic_perform_write
      0.08 ± 20%      -0.1        0.03 ±100%  perf-profile.self.cycles-pp.xas_find_marked
      0.11 ± 18%      -0.0        0.06 ± 19%  perf-profile.self.cycles-pp._raw_spin_unlock_irqrestore
      0.20 ± 11%      -0.0        0.16 ±  6%  perf-profile.self.cycles-pp._raw_spin_trylock
      0.14 ±  7%      -0.0        0.11 ±  8%  perf-profile.self.cycles-pp.up_write
      0.05 ± 48%      +0.0        0.08 ± 21%  perf-profile.self.cycles-pp.bvec_try_merge_page
      0.04 ± 71%      +0.0        0.08 ± 10%  perf-profile.self.cycles-pp.alloc_pages_noprof
      0.04 ± 71%      +0.0        0.08 ± 22%  perf-profile.self.cycles-pp.__xa_insert
      0.03 ±101%      +0.1        0.09 ± 17%  perf-profile.self.cycles-pp.update_process_times
      0.09 ± 17%      +0.1        0.14 ± 13%  perf-profile.self.cycles-pp.brd_submit_bio
      0.20 ±  9%      +0.1        0.27 ± 10%  perf-profile.self.cycles-pp.ext4_bio_write_folio
      0.00            +0.1        0.13 ±  8%  perf-profile.self.cycles-pp.folio_alloc_noprof
      0.52 ± 29%      +0.2        0.68 ±  5%  perf-profile.self.cycles-pp.io_serial_out
      0.29 ± 36%      +0.2        0.51 ± 26%  perf-profile.self.cycles-pp.tick_nohz_handler
      0.20 ±  8%      +0.3        0.47 ± 14%  perf-profile.self.cycles-pp.ext4_finish_bio
      1.39 ± 10%      +0.6        1.96 ±  3%  perf-profile.self.cycles-pp.clear_page_erms
      0.54 ± 34%      +0.6        1.15 ± 24%  perf-profile.self.cycles-pp.get_jiffies_update
      4.90 ± 32%      +1.4        6.34        perf-profile.self.cycles-pp.memcpy_toio
      7.20 ±  9%      +1.9        9.08 ±  2%  perf-profile.self.cycles-pp.rep_movs_alternative
      3.94 ±  7%      +2.0        5.96 ±  6%  perf-profile.self.cycles-pp.memcpy_orig
      6.48 ±  7%      +2.8        9.28 ±  3%  perf-profile.self.cycles-pp._raw_spin_lock




Disclaimer:
Results have been estimated based on internal Intel analysis and are provided
for informational purposes only. Any difference in system hardware or software
design or configuration may affect actual performance.
Ojaswin Mujoo May 20, 2025, 10:48 a.m. UTC | #2
On Mon, May 12, 2025 at 02:33:19PM +0800, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
> 
> Besides fsverity, fscrypt, and the data=journal mode, ext4 now supports
> large folios for regular files. Enable this feature by default. However,
> since we cannot change the folio order limitation of mappings on active
> inodes, setting the journal=data mode via ioctl on an active inode will
> not take immediate effect in non-delalloc mode.
> 

Looks good:

Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>

Thanks,
Ojaswin

> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> ---
>  fs/ext4/ext4.h      |  1 +
>  fs/ext4/ext4_jbd2.c |  3 ++-
>  fs/ext4/ialloc.c    |  3 +++
>  fs/ext4/inode.c     | 20 ++++++++++++++++++++
>  4 files changed, 26 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 5a20e9cd7184..2fad90c30493 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -2993,6 +2993,7 @@ int ext4_walk_page_buffers(handle_t *handle,
>  				     struct buffer_head *bh));
>  int do_journal_get_write_access(handle_t *handle, struct inode *inode,
>  				struct buffer_head *bh);
> +bool ext4_should_enable_large_folio(struct inode *inode);
>  #define FALL_BACK_TO_NONDELALLOC 1
>  #define CONVERT_INLINE_DATA	 2
>  
> diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
> index 135e278c832e..b3e9b7bd7978 100644
> --- a/fs/ext4/ext4_jbd2.c
> +++ b/fs/ext4/ext4_jbd2.c
> @@ -16,7 +16,8 @@ int ext4_inode_journal_mode(struct inode *inode)
>  	    ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
>  	    test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
>  	    (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
> -	    !test_opt(inode->i_sb, DELALLOC))) {
> +	    !test_opt(inode->i_sb, DELALLOC) &&
> +	    !mapping_large_folio_support(inode->i_mapping))) {
>  		/* We do not support data journalling for encrypted data */
>  		if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
>  			return EXT4_INODE_ORDERED_DATA_MODE;  /* ordered */
> diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
> index e7ecc7c8a729..4938e78cbadc 100644
> --- a/fs/ext4/ialloc.c
> +++ b/fs/ext4/ialloc.c
> @@ -1336,6 +1336,9 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
>  		}
>  	}
>  
> +	if (ext4_should_enable_large_folio(inode))
> +		mapping_set_large_folios(inode->i_mapping);
> +
>  	ext4_update_inode_fsync_trans(handle, inode, 1);
>  
>  	err = ext4_mark_inode_dirty(handle, inode);
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 29eccdf8315a..7fd3921cfe46 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4774,6 +4774,23 @@ static int check_igot_inode(struct inode *inode, ext4_iget_flags flags,
>  	return -EFSCORRUPTED;
>  }
>  
> +bool ext4_should_enable_large_folio(struct inode *inode)
> +{
> +	struct super_block *sb = inode->i_sb;
> +
> +	if (!S_ISREG(inode->i_mode))
> +		return false;
> +	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
> +	    ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
> +		return false;
> +	if (ext4_has_feature_verity(sb))
> +		return false;
> +	if (ext4_has_feature_encrypt(sb))
> +		return false;
> +
> +	return true;
> +}
> +
>  struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
>  			  ext4_iget_flags flags, const char *function,
>  			  unsigned int line)
> @@ -5096,6 +5113,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
>  		ret = -EFSCORRUPTED;
>  		goto bad_inode;
>  	}
> +	if (ext4_should_enable_large_folio(inode))
> +		mapping_set_large_folios(inode->i_mapping);
> +
>  	ret = check_igot_inode(inode, flags, function, line);
>  	/*
>  	 * -ESTALE here means there is nothing inherently wrong with the inode,
> -- 
> 2.46.1
>
Lai, Yi June 25, 2025, 8:14 a.m. UTC | #3
Hi Zhang Yi,

Greetings!

I used Syzkaller and found that there is general protection fault in try_to_unmap_one in linux-next next-20250623.

After bisection and the first bad commit is:
"
7ac67301e82f ext4: enable large folio for regular file
"

All detailed into can be found at:
https://github.com/laifryiee/syzkaller_logs/tree/main/250624_222435_try_to_unmap_one
Syzkaller repro code:
https://github.com/laifryiee/syzkaller_logs/tree/main/250624_222435_try_to_unmap_one/repro.c
Syzkaller repro syscall steps:
https://github.com/laifryiee/syzkaller_logs/tree/main/250624_222435_try_to_unmap_one/repro.prog
Syzkaller report:
https://github.com/laifryiee/syzkaller_logs/tree/main/250624_222435_try_to_unmap_one/repro.report
Kconfig(make olddefconfig):
https://github.com/laifryiee/syzkaller_logs/tree/main/250624_222435_try_to_unmap_one/kconfig_origin
Bisect info:
https://github.com/laifryiee/syzkaller_logs/tree/main/250624_222435_try_to_unmap_one/bisect_info.log
bzImage:
https://github.com/laifryiee/syzkaller_logs/raw/refs/heads/main/250624_222435_try_to_unmap_one/bzImage_86731a2a651e58953fc949573895f2fa6d456841
Issue dmesg:
https://github.com/laifryiee/syzkaller_logs/blob/main/250624_222435_try_to_unmap_one/86731a2a651e58953fc949573895f2fa6d456841_dmesg.log

"
[   48.166741] Injecting memory failure for pfn 0x28c00 at process virtual address 0x20ffc000
[   48.167878] Memory failure: 0x28c00: Sending SIGBUS to repro:668 due to hardware memory corruption
[   48.169079] Memory failure: 0x28c00: recovery action for unsplit thp: Failed
[   48.657334] Oops: general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] SMP KASI
[   48.658081] KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007]
[   48.658561] CPU: 0 UID: 0 PID: 675 Comm: repro Not tainted 6.16.0-rc3-86731a2a651e #1 PREEMPT(voluntary)
[   48.659153] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org4
[   48.659862] RIP: 0010:try_to_unmap_one+0x4ef/0x3860
[   48.660204] Code: f5 a5 ff 48 8b 9d 78 ff ff ff 49 8d 46 18 48 89 85 70 fe ff ff 48 85 db 0f 84 96 1a 00 00 e8 c8 f58
[   48.661345] RSP: 0018:ffff88801a55ebc0 EFLAGS: 00010246
[   48.661685] RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffffff81e1a1a1
[   48.662136] RDX: ffff888014502540 RSI: ffffffff81e186c8 RDI: 0000000000000005
[   48.662584] RBP: ffff88801a55ed90 R08: 0000000000000001 R09: ffffed10034abd3b
[   48.663030] R10: 0000000000000000 R11: ffff888014503398 R12: 0000000020e00000
[   48.663490] R13: ffffea0000a30000 R14: ffffea0000a30000 R15: dffffc0000000000
[   48.663950] FS:  00007f2e4c104740(0000) GS:ffff8880e3562000(0000) knlGS:0000000000000000
[   48.664464] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   48.664836] CR2: 0000000021000000 CR3: 00000000115ae003 CR4: 0000000000770ef0
[   48.665297] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   48.665756] DR3: 0000000000000000 DR6: 00000000ffff07f0 DR7: 0000000000000400
[   48.666210] PKRU: 55555554
[   48.666398] Call Trace:
[   48.666569]  <TASK>
[   48.666729]  ? __pfx_try_to_unmap_one+0x10/0x10
[   48.667048]  __rmap_walk_file+0x2a5/0x4a0
[   48.667324]  rmap_walk+0x16b/0x1f0
[   48.667563]  try_to_unmap+0x12f/0x140
[   48.667818]  ? __pfx_try_to_unmap+0x10/0x10
[   48.668104]  ? __pfx_try_to_unmap_one+0x10/0x10
[   48.668408]  ? __pfx_folio_not_mapped+0x10/0x10
[   48.668713]  ? __pfx_folio_lock_anon_vma_read+0x10/0x10
[   48.669066]  ? __sanitizer_cov_trace_const_cmp8+0x1c/0x30
[   48.669438]  unmap_poisoned_folio+0x130/0x500
[   48.669743]  shrink_folio_list+0x44f/0x3d90
[   48.670036]  ? stack_depot_save_flags+0x445/0xa40
[   48.670366]  ? __this_cpu_preempt_check+0x21/0x30
[   48.670711]  ? lockdep_hardirqs_on+0x89/0x110
[   48.671014]  ? __pfx_shrink_folio_list+0x10/0x10
[   48.671325]  ? is_bpf_text_address+0x94/0x1b0
[   48.671628]  ? debug_smp_processor_id+0x20/0x30
[   48.671937]  ? is_bpf_text_address+0x9e/0x1b0
[   48.672232]  ? kernel_text_address+0xd3/0xe0
[   48.672538]  ? __kernel_text_address+0x16/0x50
[   48.672845]  ? unwind_get_return_address+0x65/0xb0
[   48.673178]  ? __pfx_stack_trace_consume_entry+0x10/0x10
[   48.673540]  ? arch_stack_walk+0xa1/0xf0
[   48.673826]  reclaim_folio_list+0xe2/0x4c0
[   48.674104]  ? check_path.constprop.0+0x28/0x50
[   48.674422]  ? __pfx_reclaim_folio_list+0x10/0x10
[   48.674750]  ? folio_isolate_lru+0x38c/0x590
[   48.675047]  reclaim_pages+0x393/0x560
[   48.675306]  ? __pfx_reclaim_pages+0x10/0x10
[   48.675605]  ? do_raw_spin_unlock+0x15c/0x210
[   48.675900]  madvise_cold_or_pageout_pte_range+0x1cac/0x2800
[   48.676287]  ? __pfx_madvise_cold_or_pageout_pte_range+0x10/0x10
[   48.676687]  ? lock_is_held_type+0xef/0x150
[   48.676975]  ? __pfx_madvise_cold_or_pageout_pte_range+0x10/0x10
[   48.677372]  walk_pgd_range+0xe2d/0x2420
[   48.677654]  ? __pfx_walk_pgd_range+0x10/0x10
[   48.677955]  __walk_page_range+0x177/0x810
[   48.678236]  ? find_vma+0xc4/0x140
[   48.678478]  ? __pfx_find_vma+0x10/0x10
[   48.678746]  ? __this_cpu_preempt_check+0x21/0x30
[   48.679062]  ? __sanitizer_cov_trace_const_cmp8+0x1c/0x30
[   48.679428]  walk_page_range_mm+0x39f/0x770
[   48.679718]  ? __pfx_walk_page_range_mm+0x10/0x10
[   48.680038]  ? __this_cpu_preempt_check+0x21/0x30
[   48.680355]  ? __sanitizer_cov_trace_const_cmp4+0x1a/0x20
[   48.680713]  ? mlock_drain_local+0x27f/0x4b0
[   48.681006]  walk_page_range+0x70/0xa0
[   48.681263]  ? __kasan_check_write+0x18/0x20
[   48.681562]  madvise_do_behavior+0x13e3/0x35f0
[   48.681874]  ? copy_vma_and_data+0x353/0x7d0
[   48.682169]  ? __pfx_madvise_do_behavior+0x10/0x10
[   48.682497]  ? __pfx_arch_get_unmapped_area_topdown+0x10/0x10
[   48.682885]  ? __this_cpu_preempt_check+0x21/0x30
[   48.683203]  ? lock_is_held_type+0xef/0x150
[   48.683494]  ? __lock_acquire+0x412/0x22a0
[   48.683789]  ? __this_cpu_preempt_check+0x21/0x30
[   48.684108]  ? lock_acquire+0x180/0x310
[   48.684381]  ? __pfx_down_read+0x10/0x10
[   48.684656]  ? __lock_acquire+0x412/0x22a0
[   48.684953]  ? __pfx___do_sys_mremap+0x10/0x10
[   48.685257]  ? __sanitizer_cov_trace_switch+0x58/0xa0
[   48.685603]  do_madvise+0x193/0x2b0
[   48.685852]  ? do_madvise+0x193/0x2b0
[   48.686122]  ? __pfx_do_madvise+0x10/0x10
[   48.686401]  ? __this_cpu_preempt_check+0x21/0x30
[   48.686715]  ? seqcount_lockdep_reader_access.constprop.0+0xb4/0xd0
[   48.687154]  ? lockdep_hardirqs_on+0x89/0x110
[   48.687457]  ? trace_hardirqs_on+0x51/0x60
[   48.687751]  ? seqcount_lockdep_reader_access.constprop.0+0xc0/0xd0
[   48.688162]  ? __sanitizer_cov_trace_cmp4+0x1a/0x20
[   48.688492]  ? ktime_get_coarse_real_ts64+0xad/0xf0
[   48.688823]  ? __audit_syscall_entry+0x39c/0x500
[   48.689134]  __x64_sys_madvise+0xb2/0x120
[   48.689411]  ? syscall_trace_enter+0x14d/0x280
[   48.689720]  x64_sys_call+0x19ac/0x2150
[   48.689987]  do_syscall_64+0x6d/0x2e0
[   48.690248]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[   48.690583] RIP: 0033:0x7f2e4be3ee5d
[   48.690842] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d8
[   48.692016] RSP: 002b:00007ffeb3fe8e68 EFLAGS: 00000217 ORIG_RAX: 000000000000001c
[   48.692503] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f2e4be3ee5d
[   48.692978] RDX: 0000000000000015 RSI: 0000000000c00000 RDI: 0000000020400000
[   48.693435] RBP: 00007ffeb3fe8e80 R08: 00007ffeb3fe8e80 R09: 00007ffeb3fe8e80
[   48.693886] R10: 0000000020fc6000 R11: 0000000000000217 R12: 00007ffeb3fe8fd8
[   48.694344] R13: 00000000004018e5 R14: 0000000000403e08 R15: 00007f2e4c151000
[   48.694811]  </TASK>
[   48.694967] Modules linked in:
[   48.695320] ---[ end trace 0000000000000000 ]---
"

Hope this could be insightful to you.

Regards,
Yi Lai

---

If you don't need the following environment to reproduce the problem, or if you
already have an already-reproduced environment, please ignore the following information.

How to reproduce:
git clone https://gitlab.com/xupengfe/repro_vm_env.git
cd repro_vm_env
tar -xvf repro_vm_env.tar.gz
cd repro_vm_env; ./start3.sh  // it needs qemu-system-x86_64 and I used v7.1.0
  // start3.sh will load bzImage_2241ab53cbb5cdb08a6b2d4688feb13971058f65 v6.2-rc5 kernel
  // You could change the bzImage_xxx as you want
  // Maybe you need to remove line "-drive if=pflash,format=raw,readonly=on,file=./OVMF_CODE.fd \" for different qemu version
You could use below command to log in, there is no password for root.
ssh -p 10023 root@localhost

After login vm(virtual machine) successfully, you could transfer reproduced
binary to the vm by below way, and reproduce the problem in vm:
gcc -pthread -o repro repro.c
scp -P 10023 repro root@localhost:/root/

Get the bzImage for target kernel:
Please use target kconfig and copy it to kernel_src/.config
make olddefconfig
make -jx bzImage           //x should equal or less than cpu num your pc has

Fill the bzImage file into above start3.sh to load the target kernel in vm.


Tips:
If you already have qemu-system-x86_64, please ignore below info.
If you want to install qemu v7.1.0 version:
git clone https://github.com/qemu/qemu.git
cd qemu
git checkout -f v7.1.0
mkdir build
cd build
yum install -y ninja-build.x86_64
yum -y install libslirp-devel.x86_64
../configure --target-list=x86_64-softmmu --enable-kvm --enable-vnc --enable-gtk --enable-sdl --enable-usb-redir --enable-slirp
make
make install 


On Mon, May 12, 2025 at 02:33:19PM +0800, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
> 
> Besides fsverity, fscrypt, and the data=journal mode, ext4 now supports
> large folios for regular files. Enable this feature by default. However,
> since we cannot change the folio order limitation of mappings on active
> inodes, setting the journal=data mode via ioctl on an active inode will
> not take immediate effect in non-delalloc mode.
> 
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> ---
>  fs/ext4/ext4.h      |  1 +
>  fs/ext4/ext4_jbd2.c |  3 ++-
>  fs/ext4/ialloc.c    |  3 +++
>  fs/ext4/inode.c     | 20 ++++++++++++++++++++
>  4 files changed, 26 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 5a20e9cd7184..2fad90c30493 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -2993,6 +2993,7 @@ int ext4_walk_page_buffers(handle_t *handle,
>  				     struct buffer_head *bh));
>  int do_journal_get_write_access(handle_t *handle, struct inode *inode,
>  				struct buffer_head *bh);
> +bool ext4_should_enable_large_folio(struct inode *inode);
>  #define FALL_BACK_TO_NONDELALLOC 1
>  #define CONVERT_INLINE_DATA	 2
>  
> diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
> index 135e278c832e..b3e9b7bd7978 100644
> --- a/fs/ext4/ext4_jbd2.c
> +++ b/fs/ext4/ext4_jbd2.c
> @@ -16,7 +16,8 @@ int ext4_inode_journal_mode(struct inode *inode)
>  	    ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
>  	    test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
>  	    (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
> -	    !test_opt(inode->i_sb, DELALLOC))) {
> +	    !test_opt(inode->i_sb, DELALLOC) &&
> +	    !mapping_large_folio_support(inode->i_mapping))) {
>  		/* We do not support data journalling for encrypted data */
>  		if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
>  			return EXT4_INODE_ORDERED_DATA_MODE;  /* ordered */
> diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
> index e7ecc7c8a729..4938e78cbadc 100644
> --- a/fs/ext4/ialloc.c
> +++ b/fs/ext4/ialloc.c
> @@ -1336,6 +1336,9 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
>  		}
>  	}
>  
> +	if (ext4_should_enable_large_folio(inode))
> +		mapping_set_large_folios(inode->i_mapping);
> +
>  	ext4_update_inode_fsync_trans(handle, inode, 1);
>  
>  	err = ext4_mark_inode_dirty(handle, inode);
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 29eccdf8315a..7fd3921cfe46 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4774,6 +4774,23 @@ static int check_igot_inode(struct inode *inode, ext4_iget_flags flags,
>  	return -EFSCORRUPTED;
>  }
>  
> +bool ext4_should_enable_large_folio(struct inode *inode)
> +{
> +	struct super_block *sb = inode->i_sb;
> +
> +	if (!S_ISREG(inode->i_mode))
> +		return false;
> +	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
> +	    ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
> +		return false;
> +	if (ext4_has_feature_verity(sb))
> +		return false;
> +	if (ext4_has_feature_encrypt(sb))
> +		return false;
> +
> +	return true;
> +}
> +
>  struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
>  			  ext4_iget_flags flags, const char *function,
>  			  unsigned int line)
> @@ -5096,6 +5113,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
>  		ret = -EFSCORRUPTED;
>  		goto bad_inode;
>  	}
> +	if (ext4_should_enable_large_folio(inode))
> +		mapping_set_large_folios(inode->i_mapping);
> +
>  	ret = check_igot_inode(inode, flags, function, line);
>  	/*
>  	 * -ESTALE here means there is nothing inherently wrong with the inode,
> -- 
> 2.46.1
>
Theodore Ts'o June 25, 2025, 1:15 p.m. UTC | #4
It looks like this failure requires using madvise() with MADV_HWPOISON
(which requires root) and MADV_PAGEOUT, and the stack trace is deep
in an mm codepath:

   madvise_cold_or_pageout_pte_range+0x1cac/0x2800
      reclaim_pages+0x393/0x560
         reclaim_folio_list+0xe2/0x4c0
            shrink_folio_list+0x44f/0x3d90
                unmap_poisoned_folio+0x130/0x500
                    try_to_unmap+0x12f/0x140
                       rmap_walk+0x16b/0x1f0
		       ...

The bisected commit is the one which enables using large folios, so
while it's possible that this is due to ext4 doing something not quite
right when using large folios, it's also possible that this might be a
bug in the folio/mm code paths.

Does this reproduce on other file systems, such as XFS?

     	  	       	     	  	   	- Ted
Lai, Yi June 26, 2025, 3:35 a.m. UTC | #5
On Wed, Jun 25, 2025 at 09:15:45AM -0400, Theodore Ts'o wrote:
> It looks like this failure requires using madvise() with MADV_HWPOISON
> (which requires root) and MADV_PAGEOUT, and the stack trace is in deep
> in the an mm codepath:
> 
>    madvise_cold_or_pageout_pte_range+0x1cac/0x2800
>       reclaim_pages+0x393/0x560
>          reclaim_folio_list+0xe2/0x4c0
>             shrink_folio_list+0x44f/0x3d90
>                 unmap_poisoned_folio+0x130/0x500
>                     try_to_unmap+0x12f/0x140
>                        rmap_walk+0x16b/0x1f0
> 		       ...
> 
> The bisected commit is the one which enables using large folios, so
> while it's possible that this due to ext4 doing something not quite
> right when using large folios, it's also posible that this might be a
> bug in the folio/mm code paths.
> 
> Does this reproduce on other file systems, such as XFS?
>

Indeed, this issue can also be reproduced on the XFS file system. Thanks for the advice. I will conduct cross-filesystem validation next time when I encounter an ext4 issue.

[  395.888267] Oops: general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] SMP KASI
[  395.888767] KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007]
[  395.889150] CPU: 2 UID: 0 PID: 7420 Comm: repro Not tainted 6.16.0-rc3-86731a2a651e #1 PREEMPT(voluntary)
[  395.889620] Hardware name: Red Hat KVM/RHEL, BIOS edk2-20241117-3.el9 11/17/2024
[  395.889967] RIP: 0010:try_to_unmap_one+0x4ef/0x3860
[  395.890230] Code: f5 a5 ff 48 8b 9d 78 ff ff ff 49 8d 46 18 48 89 85 70 fe ff ff 48 85 db 0f 84 96 1a 00 00 e8 c8 f58
[  395.891081] RSP: 0018:ff1100011869ebc0 EFLAGS: 00010246
[  395.891337] RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffffff81e1a1a1
[  395.891676] RDX: ff11000130330000 RSI: ffffffff81e186c8 RDI: 0000000000000005
[  395.892018] RBP: ff1100011869ed90 R08: 0000000000000001 R09: ffe21c00230d3d3b
[  395.892356] R10: 0000000000000000 R11: ff11000130330e58 R12: 0000000020e00000
[  395.892691] R13: ffd40000043c8000 R14: ffd40000043c8000 R15: dffffc0000000000
[  395.893043] FS:  00007fbd34523740(0000) GS:ff110004a4e62000(0000) knlGS:0000000000000000
[  395.893437] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  395.893718] CR2: 0000000021000000 CR3: 000000010f8bf004 CR4: 0000000000771ef0
[  395.894060] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  395.894398] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400
[  395.894732] PKRU: 55555554
[  395.894868] Call Trace:
[  395.894991]  <TASK>
[  395.895109]  ? __pfx_try_to_unmap_one+0x10/0x10
[  395.895337]  __rmap_walk_file+0x2a5/0x4a0
[  395.895538]  rmap_walk+0x16b/0x1f0
[  395.895706]  try_to_unmap+0x12f/0x140
[  395.895853]  ? __pfx_try_to_unmap+0x10/0x10
[  395.896061]  ? __pfx_try_to_unmap_one+0x10/0x10
[  395.896284]  ? __pfx_folio_not_mapped+0x10/0x10
[  395.896504]  ? __pfx_folio_lock_anon_vma_read+0x10/0x10
[  395.896758]  ? __sanitizer_cov_trace_const_cmp8+0x1c/0x30
[  395.897025]  unmap_poisoned_folio+0x130/0x500
[  395.897251]  shrink_folio_list+0x44f/0x3d90
[  395.897476]  ? __pfx_shrink_folio_list+0x10/0x10
[  395.897719]  ? is_bpf_text_address+0x94/0x1b0
[  395.897941]  ? debug_smp_processor_id+0x20/0x30
[  395.898172]  ? is_bpf_text_address+0x9e/0x1b0
[  395.898387]  ? kernel_text_address+0xd3/0xe0
[  395.898604]  ? __kernel_text_address+0x16/0x50
[  395.898827]  ? unwind_get_return_address+0x65/0xb0
[  395.899066]  ? __pfx_stack_trace_consume_entry+0x10/0x10
[  395.899326]  ? arch_stack_walk+0xa1/0xf0
[  395.899530]  reclaim_folio_list+0xe2/0x4c0
[  395.899733]  ? check_path.constprop.0+0x28/0x50
[  395.899963]  ? __pfx_reclaim_folio_list+0x10/0x10
[  395.900198]  ? folio_isolate_lru+0x38c/0x590
[  395.900412]  reclaim_pages+0x393/0x560
[  395.900606]  ? __pfx_reclaim_pages+0x10/0x10
[  395.900824]  ? do_raw_spin_unlock+0x15c/0x210
[  395.901044]  madvise_cold_or_pageout_pte_range+0x1cac/0x2800
[  395.901326]  ? __pfx_madvise_cold_or_pageout_pte_range+0x10/0x10
[  395.901631]  ? lock_is_held_type+0xef/0x150
[  395.901852]  ? __pfx_madvise_cold_or_pageout_pte_range+0x10/0x10
[  395.902158]  walk_pgd_range+0xe2d/0x2420
[  395.902373]  ? __pfx_walk_pgd_range+0x10/0x10
[  395.902593]  __walk_page_range+0x177/0x810
[  395.902799]  ? find_vma+0xc4/0x140
[  395.902977]  ? __pfx_find_vma+0x10/0x10
[  395.903176]  ? __this_cpu_preempt_check+0x21/0x30
[  395.903401]  ? __sanitizer_cov_trace_const_cmp8+0x1c/0x30
[  395.903667]  walk_page_range_mm+0x39f/0x770
[  395.903877]  ? __pfx_walk_page_range_mm+0x10/0x10
[  395.904109]  ? __this_cpu_preempt_check+0x21/0x30
[  395.904340]  ? __sanitizer_cov_trace_const_cmp4+0x1a/0x20
[  395.904606]  ? mlock_drain_local+0x27f/0x4b0
[  395.904826]  walk_page_range+0x70/0xa0
[  395.905013]  ? __kasan_check_write+0x18/0x20
[  395.905227]  madvise_do_behavior+0x13e3/0x35f0
[  395.905453]  ? copy_vma_and_data+0x353/0x7d0
[  395.905674]  ? __pfx_madvise_do_behavior+0x10/0x10
[  395.905922]  ? __pfx_arch_get_unmapped_area_topdown+0x10/0x10
[  395.906219]  ? __this_cpu_preempt_check+0x21/0x30
[  395.906455]  ? lock_is_held_type+0xef/0x150
[  395.906665]  ? __lock_acquire+0x412/0x22a0
[  395.906875]  ? __this_cpu_preempt_check+0x21/0x30
[  395.907105]  ? lock_acquire+0x180/0x310
[  395.907306]  ? __pfx_down_read+0x10/0x10
[  395.907503]  ? __lock_acquire+0x412/0x22a0
[  395.907707]  ? __pfx___do_sys_mremap+0x10/0x10
[  395.907929]  ? __sanitizer_cov_trace_switch+0x58/0xa0
[  395.908186]  do_madvise+0x193/0x2b0
[  395.908363]  ? do_madvise+0x193/0x2b0
[  395.908550]  ? __pfx_do_madvise+0x10/0x10
[  395.908801]  ? __this_cpu_preempt_check+0x21/0x30
[  395.909036]  ? seqcount_lockdep_reader_access.constprop.0+0xb4/0xd0
[  395.909335]  ? lockdep_hardirqs_on+0x89/0x110
[  395.909556]  ? trace_hardirqs_on+0x51/0x60
[  395.909763]  ? seqcount_lockdep_reader_access.constprop.0+0xc0/0xd0
[  395.910073]  ? __sanitizer_cov_trace_cmp4+0x1a/0x20
[  395.910332]  ? ktime_get_coarse_real_ts64+0xad/0xf0
[  395.910578]  ? __audit_syscall_entry+0x39c/0x500
[  395.910812]  __x64_sys_madvise+0xb2/0x120
[  395.911016]  ? syscall_trace_enter+0x14d/0x280
[  395.911240]  x64_sys_call+0x19ac/0x2150
[  395.911431]  do_syscall_64+0x6d/0x2e0
[  395.911619]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[  395.911865] RIP: 0033:0x7fbd3430756d
[  395.912046] Code: 5b 41 5c c3 66 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d8
[  395.912905] RSP: 002b:00007ffe6486ec48 EFLAGS: 00000217 ORIG_RAX: 000000000000001c
[  395.913267] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fbd3430756d
[  395.913603] RDX: 0000000000000015 RSI: 0000000000c00000 RDI: 0000000020400000
[  395.913941] RBP: 00007ffe6486ec60 R08: 00007ffe6486ec60 R09: 00007ffe6486ec60
[  395.914280] R10: 0000000020fc6000 R11: 0000000000000217 R12: 00007ffe6486edb8
[  395.914629] R13: 00000000004018e5 R14: 0000000000403e08 R15: 00007fbd3456a000
[  395.914989]  </TASK>
[  395.915111] Modules linked in:
[  395.915296] ---[ end trace 0000000000000000 ]---

FYI, there is ongoing discussion in terms of folio/mm domain - https://lore.kernel.org/all/20250611074643.250837-1-tujinjiang@huawei.com/T/

Regards,
Yi Lai

 
>      	  	       	     	  	   	- Ted
D, Suneeth June 26, 2025, 11:29 a.m. UTC | #6
Hello Zhang Yi,

On 5/12/2025 12:03 PM, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
> 
> Besides fsverity, fscrypt, and the data=journal mode, ext4 now supports
> large folios for regular files. Enable this feature by default. However,
> since we cannot change the folio order limitation of mappings on active
> inodes, setting the journal=data mode via ioctl on an active inode will
> not take immediate effect in non-delalloc mode.
> 

We run lmbench3 as part of our Weekly CI for the purpose of Kernel 
Performance Regression testing between a stable vs rc kernel. We noticed 
a regression on the kernels starting from 6.16-rc1 all the way through 
6.16-rc3 in the range of 8-12%. Further bisection b/w 6.15 and 6.16-rc1 
pointed me to the first bad commit as 
7ac67301e82f02b77a5c8e7377a1f414ef108b84. The following were the machine 
configurations and test parameters used:-

Model name:           AMD EPYC 9754 128-Core Processor [Bergamo]
Thread(s) per core:   2
Core(s) per socket:   128
Socket(s):            1
Total online memory:  258G

micro-benchmark_variant: "lmbench3-development-1-0-MMAP-50%" which has 
the following parameters,

-> nr_thread: 	1
-> memory_size: 50%
-> mode: 	development
-> test:        MMAP

The following are the stats after bisection:-

(the KPI used here is lmbench3.MMAP.read.latency.us)

v6.15 - 						97.3K

v6.16-rc1 - 						107.5K

v6.16-rc3 - 						107.4K

6.15.0-rc4badcommit - 					103.5K

6.15.0-rc4badcommit_m1 (one commit before bad-commit) - 94.2K

I also ran the micro-benchmark with tools/testing/perf record and 
following is the output from tools/testing/perf diff b/w the bad commit 
and just one commit before that.

# ./perf diff perf.data.old  perf.data
No kallsyms or vmlinux with build-id 
da8042fb274c5e3524318e5e3afbeeef5df2055e was found
# Event 'cycles:P'
#
# Baseline  Delta Abs  Shared Object            Symbol 
 
 
            >
# ........  .........  ....................... 
....................................................................................................................................................................................>
#
                +4.34%  [kernel.kallsyms]        [k] __lruvec_stat_mod_folio
                +3.41%  [kernel.kallsyms]        [k] unmap_page_range
                +3.33%  [kernel.kallsyms]        [k] 
__mod_memcg_lruvec_state
                +2.04%  [kernel.kallsyms]        [k] srso_alias_return_thunk
                +2.02%  [kernel.kallsyms]        [k] srso_alias_safe_ret
     22.22%     -1.78%  bw_mmap_rd               [.] bread
                +1.76%  [kernel.kallsyms]        [k] __handle_mm_fault
                +1.70%  [kernel.kallsyms]        [k] filemap_map_pages
                +1.58%  [kernel.kallsyms]        [k] set_pte_range
                +1.58%  [kernel.kallsyms]        [k] next_uptodate_folio
                +1.33%  [kernel.kallsyms]        [k] do_anonymous_page
                +1.01%  [kernel.kallsyms]        [k] get_page_from_freelist
                +0.98%  [kernel.kallsyms]        [k] __mem_cgroup_charge
                +0.85%  [kernel.kallsyms]        [k] asm_exc_page_fault
                +0.82%  [kernel.kallsyms]        [k] native_irq_return_iret
                +0.82%  [kernel.kallsyms]        [k] do_user_addr_fault
                +0.77%  [kernel.kallsyms]        [k] clear_page_erms
                +0.75%  [kernel.kallsyms]        [k] handle_mm_fault
                +0.73%  [kernel.kallsyms]        [k] set_ptes.isra.0
                +0.70%  [kernel.kallsyms]        [k] lru_add
                +0.69%  [kernel.kallsyms]        [k] 
folio_add_file_rmap_ptes
                +0.68%  [kernel.kallsyms]        [k] folio_remove_rmap_ptes
     12.45%     -0.65%  line                     [.] mem_benchmark_0
                +0.64%  [kernel.kallsyms]        [k] 
__alloc_frozen_pages_noprof
                +0.63%  [kernel.kallsyms]        [k] vm_normal_page
                +0.63%  [kernel.kallsyms]        [k] 
free_pages_and_swap_cache
                +0.63%  [kernel.kallsyms]        [k] lock_vma_under_rcu
                +0.60%  [kernel.kallsyms]        [k] __rcu_read_unlock
                +0.59%  [kernel.kallsyms]        [k] cgroup_rstat_updated
                +0.57%  [kernel.kallsyms]        [k] get_mem_cgroup_from_mm
                +0.52%  [kernel.kallsyms]        [k] __mod_lruvec_state
                +0.51%  [kernel.kallsyms]        [k] exc_page_fault

> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> ---
>   fs/ext4/ext4.h      |  1 +
>   fs/ext4/ext4_jbd2.c |  3 ++-
>   fs/ext4/ialloc.c    |  3 +++
>   fs/ext4/inode.c     | 20 ++++++++++++++++++++
>   4 files changed, 26 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 5a20e9cd7184..2fad90c30493 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -2993,6 +2993,7 @@ int ext4_walk_page_buffers(handle_t *handle,
>   				     struct buffer_head *bh));
>   int do_journal_get_write_access(handle_t *handle, struct inode *inode,
>   				struct buffer_head *bh);
> +bool ext4_should_enable_large_folio(struct inode *inode);
>   #define FALL_BACK_TO_NONDELALLOC 1
>   #define CONVERT_INLINE_DATA	 2
>   
> diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
> index 135e278c832e..b3e9b7bd7978 100644
> --- a/fs/ext4/ext4_jbd2.c
> +++ b/fs/ext4/ext4_jbd2.c
> @@ -16,7 +16,8 @@ int ext4_inode_journal_mode(struct inode *inode)
>   	    ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
>   	    test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
>   	    (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
> -	    !test_opt(inode->i_sb, DELALLOC))) {
> +	    !test_opt(inode->i_sb, DELALLOC) &&
> +	    !mapping_large_folio_support(inode->i_mapping))) {
>   		/* We do not support data journalling for encrypted data */
>   		if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
>   			return EXT4_INODE_ORDERED_DATA_MODE;  /* ordered */
> diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
> index e7ecc7c8a729..4938e78cbadc 100644
> --- a/fs/ext4/ialloc.c
> +++ b/fs/ext4/ialloc.c
> @@ -1336,6 +1336,9 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
>   		}
>   	}
>   
> +	if (ext4_should_enable_large_folio(inode))
> +		mapping_set_large_folios(inode->i_mapping);
> +
>   	ext4_update_inode_fsync_trans(handle, inode, 1);
>   
>   	err = ext4_mark_inode_dirty(handle, inode);
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 29eccdf8315a..7fd3921cfe46 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4774,6 +4774,23 @@ static int check_igot_inode(struct inode *inode, ext4_iget_flags flags,
>   	return -EFSCORRUPTED;
>   }
>   
> +bool ext4_should_enable_large_folio(struct inode *inode)
> +{
> +	struct super_block *sb = inode->i_sb;
> +
> +	if (!S_ISREG(inode->i_mode))
> +		return false;
> +	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
> +	    ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
> +		return false;
> +	if (ext4_has_feature_verity(sb))
> +		return false;
> +	if (ext4_has_feature_encrypt(sb))
> +		return false;
> +
> +	return true;
> +}
> +
>   struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
>   			  ext4_iget_flags flags, const char *function,
>   			  unsigned int line)
> @@ -5096,6 +5113,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
>   		ret = -EFSCORRUPTED;
>   		goto bad_inode;
>   	}
> +	if (ext4_should_enable_large_folio(inode))
> +		mapping_set_large_folios(inode->i_mapping);
> +
>   	ret = check_igot_inode(inode, flags, function, line);
>   	/*
>   	 * -ESTALE here means there is nothing inherently wrong with the inode,

---
Thanks and Regards,
Suneeth D
Steps to run lmbench3

1. git clone https://github.com/intel/lmbench.git 
2. git clone https://github.com/intel/lkp-tests.git
3. cd lmbench
4. git apply lkp-tests/programs/lmbench3/pkg/lmbench3.patch
5. make
6. sed -i '/lat_pagefault -P  no/i [ -f no ] || dd if=/dev/zero of=no count=1 bs=1G' bin/x86_64-linux-gnu/lmbench
7. (
                echo 1
                echo 1
                echo 10240
                echo development

                echo no
                echo no
                echo no
                echo no
                echo no
                echo yes
                echo no
                echo no
                echo no
                echo no
                echo no
                echo no
                echo no
                echo no
                echo no
                echo no
                echo no

                echo yes
                echo
                echo
                echo
                [ 1 -eq 1 ] && echo
                echo no
        ) | make results
8. cd results/ && make
Zhang Yi June 26, 2025, 1:26 p.m. UTC | #7
Hello Suneeth D!

On 2025/6/26 19:29, D, Suneeth wrote:
> 
> Hello Zhang Yi,
> 
> On 5/12/2025 12:03 PM, Zhang Yi wrote:
>> From: Zhang Yi <yi.zhang@huawei.com>
>>
>> Besides fsverity, fscrypt, and the data=journal mode, ext4 now supports
>> large folios for regular files. Enable this feature by default. However,
>> since we cannot change the folio order limitation of mappings on active
>> inodes, setting the journal=data mode via ioctl on an active inode will
>> not take immediate effect in non-delalloc mode.
>>
> 
> We run lmbench3 as part of our Weekly CI for the purpose of Kernel Performance Regression testing between a stable vs rc kernel. We noticed a regression on the kernels starting from 6.16-rc1 all the way through 6.16-rc3 in the range of 8-12%. Further bisection b/w 6.15 and 6.16-rc1 pointed me to the first bad commit as 7ac67301e82f02b77a5c8e7377a1f414ef108b84. The following were the machine configurations and test parameters used:-
> 
> Model name:           AMD EPYC 9754 128-Core Processor [Bergamo]
> Thread(s) per core:   2
> Core(s) per socket:   128
> Socket(s):            1
> Total online memory:  258G
> 
> micro-benchmark_variant: "lmbench3-development-1-0-MMAP-50%" which has the following parameters,
> 
> -> nr_thread:     1
> -> memory_size: 50%
> -> mode:     development
> -> test:        MMAP
> 
> The following are the stats after bisection:-
> 
> (the KPI used here is lmbench3.MMAP.read.latency.us)
> 
> v6.15 -                         97.3K
> 
> v6.16-rc1 -                         107.5K
> 
> v6.16-rc3 -                         107.4K
> 
> 6.15.0-rc4badcommit -                     103.5K
> 
> 6.15.0-rc4badcommit_m1 (one commit before bad-commit) - 94.2K

Thanks for the report. I will try to reproduce this performance regression on
my machine and find out what is causing it.

Thanks,
Yi.

> 
> I also ran the micro-benchmark with tools/testing/perf record and following is the output from tools/testing/perf diff b/w the bad commit and just one commit before that.
> 
> # ./perf diff perf.data.old  perf.data
> No kallsyms or vmlinux with build-id da8042fb274c5e3524318e5e3afbeeef5df2055e was found
> # Event 'cycles:P'
> #
> # Baseline  Delta Abs  Shared Object            Symbol
> 
>            >
> # ........  .........  ....................... ....................................................................................................................................................................................>
> #
>                +4.34%  [kernel.kallsyms]        [k] __lruvec_stat_mod_folio
>                +3.41%  [kernel.kallsyms]        [k] unmap_page_range
>                +3.33%  [kernel.kallsyms]        [k] __mod_memcg_lruvec_state
>                +2.04%  [kernel.kallsyms]        [k] srso_alias_return_thunk
>                +2.02%  [kernel.kallsyms]        [k] srso_alias_safe_ret
>     22.22%     -1.78%  bw_mmap_rd               [.] bread
>                +1.76%  [kernel.kallsyms]        [k] __handle_mm_fault
>                +1.70%  [kernel.kallsyms]        [k] filemap_map_pages
>                +1.58%  [kernel.kallsyms]        [k] set_pte_range
>                +1.58%  [kernel.kallsyms]        [k] next_uptodate_folio
>                +1.33%  [kernel.kallsyms]        [k] do_anonymous_page
>                +1.01%  [kernel.kallsyms]        [k] get_page_from_freelist
>                +0.98%  [kernel.kallsyms]        [k] __mem_cgroup_charge
>                +0.85%  [kernel.kallsyms]        [k] asm_exc_page_fault
>                +0.82%  [kernel.kallsyms]        [k] native_irq_return_iret
>                +0.82%  [kernel.kallsyms]        [k] do_user_addr_fault
>                +0.77%  [kernel.kallsyms]        [k] clear_page_erms
>                +0.75%  [kernel.kallsyms]        [k] handle_mm_fault
>                +0.73%  [kernel.kallsyms]        [k] set_ptes.isra.0
>                +0.70%  [kernel.kallsyms]        [k] lru_add
>                +0.69%  [kernel.kallsyms]        [k] folio_add_file_rmap_ptes
>                +0.68%  [kernel.kallsyms]        [k] folio_remove_rmap_ptes
>     12.45%     -0.65%  line                     [.] mem_benchmark_0
>                +0.64%  [kernel.kallsyms]        [k] __alloc_frozen_pages_noprof
>                +0.63%  [kernel.kallsyms]        [k] vm_normal_page
>                +0.63%  [kernel.kallsyms]        [k] free_pages_and_swap_cache
>                +0.63%  [kernel.kallsyms]        [k] lock_vma_under_rcu
>                +0.60%  [kernel.kallsyms]        [k] __rcu_read_unlock
>                +0.59%  [kernel.kallsyms]        [k] cgroup_rstat_updated
>                +0.57%  [kernel.kallsyms]        [k] get_mem_cgroup_from_mm
>                +0.52%  [kernel.kallsyms]        [k] __mod_lruvec_state
>                +0.51%  [kernel.kallsyms]        [k] exc_page_fault
> 
>> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
>> ---
>>   fs/ext4/ext4.h      |  1 +
>>   fs/ext4/ext4_jbd2.c |  3 ++-
>>   fs/ext4/ialloc.c    |  3 +++
>>   fs/ext4/inode.c     | 20 ++++++++++++++++++++
>>   4 files changed, 26 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index 5a20e9cd7184..2fad90c30493 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -2993,6 +2993,7 @@ int ext4_walk_page_buffers(handle_t *handle,
>>                        struct buffer_head *bh));
>>   int do_journal_get_write_access(handle_t *handle, struct inode *inode,
>>                   struct buffer_head *bh);
>> +bool ext4_should_enable_large_folio(struct inode *inode);
>>   #define FALL_BACK_TO_NONDELALLOC 1
>>   #define CONVERT_INLINE_DATA     2
>>   diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
>> index 135e278c832e..b3e9b7bd7978 100644
>> --- a/fs/ext4/ext4_jbd2.c
>> +++ b/fs/ext4/ext4_jbd2.c
>> @@ -16,7 +16,8 @@ int ext4_inode_journal_mode(struct inode *inode)
>>           ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
>>           test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
>>           (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
>> -        !test_opt(inode->i_sb, DELALLOC))) {
>> +        !test_opt(inode->i_sb, DELALLOC) &&
>> +        !mapping_large_folio_support(inode->i_mapping))) {
>>           /* We do not support data journalling for encrypted data */
>>           if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
>>               return EXT4_INODE_ORDERED_DATA_MODE;  /* ordered */
>> diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
>> index e7ecc7c8a729..4938e78cbadc 100644
>> --- a/fs/ext4/ialloc.c
>> +++ b/fs/ext4/ialloc.c
>> @@ -1336,6 +1336,9 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
>>           }
>>       }
>>   +    if (ext4_should_enable_large_folio(inode))
>> +        mapping_set_large_folios(inode->i_mapping);
>> +
>>       ext4_update_inode_fsync_trans(handle, inode, 1);
>>         err = ext4_mark_inode_dirty(handle, inode);
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index 29eccdf8315a..7fd3921cfe46 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -4774,6 +4774,23 @@ static int check_igot_inode(struct inode *inode, ext4_iget_flags flags,
>>       return -EFSCORRUPTED;
>>   }
>>   +bool ext4_should_enable_large_folio(struct inode *inode)
>> +{
>> +    struct super_block *sb = inode->i_sb;
>> +
>> +    if (!S_ISREG(inode->i_mode))
>> +        return false;
>> +    if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
>> +        ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
>> +        return false;
>> +    if (ext4_has_feature_verity(sb))
>> +        return false;
>> +    if (ext4_has_feature_encrypt(sb))
>> +        return false;
>> +
>> +    return true;
>> +}
>> +
>>   struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
>>                 ext4_iget_flags flags, const char *function,
>>                 unsigned int line)
>> @@ -5096,6 +5113,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
>>           ret = -EFSCORRUPTED;
>>           goto bad_inode;
>>       }
>> +    if (ext4_should_enable_large_folio(inode))
>> +        mapping_set_large_folios(inode->i_mapping);
>> +
>>       ret = check_igot_inode(inode, flags, function, line);
>>       /*
>>        * -ESTALE here means there is nothing inherently wrong with the inode,
> 
> ---
> Thanks and Regards,
> Suneeth D
Theodore Ts'o June 26, 2025, 2:56 p.m. UTC | #8
On Thu, Jun 26, 2025 at 09:26:41PM +0800, Zhang Yi wrote:
> 
> Thanks for the report, I will try to reproduce this performance regression on
> my machine and find out what caused this regression.

I took a quick look at this, and I *think* it's because lmbench is
measuring the latency of mmap read's --- I'm going to guess 4k random
page faults, but I'm not sure.  If that's the case, this may just be a
natural result of using large folios, and the tradeoff of optimizing
for large reads versus small page faults.

But if you could take a closer look, that would be great, thanks!

						- Ted
Zhang Yi July 3, 2025, 2:13 p.m. UTC | #9
On 2025/6/26 22:56, Theodore Ts'o wrote:
> On Thu, Jun 26, 2025 at 09:26:41PM +0800, Zhang Yi wrote:
>>
>> Thanks for the report, I will try to reproduce this performance regression on
>> my machine and find out what caused this regression.
> 
> I took a quick look at this, and I *think* it's because lmbench is
> measuring the latency of mmap read's --- I'm going to guess 4k random
> page faults, but I'm not sure.  If that's the case, this may just be a
> natural result of using large folios, and the tradeoff of optimizing
> for large reads versus small page faults.
> 
> But if you could take a closer look, that would be great, thanks!
> 

After analyzing what the lmbench mmap test actually does, I found that
the regression is related to the mmap writes, not mmap reads. In other
words, the latency increases in ext4_page_mkwrite() after we enable
large folios.

The lmbench mmap test performed the following two tests:
1. mmap a range with PROT_READ|PROT_WRITE and MAP_SHARED, and then
   write one byte every 16KB sequentially.
2. mmap a range with PROT_READ and MAP_SHARED, and then read byte
   one by one sequentially.

For the mmap read test, the average page fault latency on my machine
improved from 3,634 ns to 2,005 ns. This improvement comes from being
able to skip the folio readahead loop in page_cache_async_ra()
and the set-PTE loop in filemap_map_pages() now that large folios
are supported.

For the mmap write test, the number of page faults does not decrease
with large folios (the maximum order is 5); each page still
incurs one page fault. However, ext4_page_mkwrite() iterates
multiple times over the buffer_heads in the folio, so it takes
longer. The latency of ext4_page_mkwrite() increased
from 958 ns to 1,596 ns.

After looking at the comments in finish_fault() and 43e027e414232
("mm: memory: extend finish_fault() to support large folio").

vm_fault_t finish_fault(struct vm_fault *vmf)
{
	...
	nr_pages = folio_nr_pages(folio);

	/*
	 * Using per-page fault to maintain the uffd semantics, and same
	 * approach also applies to non-anonymous-shmem faults to avoid
	 * inflating the RSS of the process.
	 */
	if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma)) ||
	    unlikely(needs_fallback)) {
		nr_pages = 1;
	...
	set_pte_range(vmf, folio, page, nr_pages, addr);
}

I believe this regression can be resolved if the finish_fault()
supports file-based large folios, but I'm not sure if we are planning
to implement this.

As for ext4_page_mkwrite(), I think it can also be optimized by reducing
the number of folio iterations, but that would make it impossible to
reuse the existing generic helpers and could make the code very messy.

Best regards,
Yi.
diff mbox series

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5a20e9cd7184..2fad90c30493 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2993,6 +2993,7 @@  int ext4_walk_page_buffers(handle_t *handle,
 				     struct buffer_head *bh));
 int do_journal_get_write_access(handle_t *handle, struct inode *inode,
 				struct buffer_head *bh);
+bool ext4_should_enable_large_folio(struct inode *inode);
 #define FALL_BACK_TO_NONDELALLOC 1
 #define CONVERT_INLINE_DATA	 2
 
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 135e278c832e..b3e9b7bd7978 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -16,7 +16,8 @@  int ext4_inode_journal_mode(struct inode *inode)
 	    ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
 	    test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
 	    (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
-	    !test_opt(inode->i_sb, DELALLOC))) {
+	    !test_opt(inode->i_sb, DELALLOC) &&
+	    !mapping_large_folio_support(inode->i_mapping))) {
 		/* We do not support data journalling for encrypted data */
 		if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
 			return EXT4_INODE_ORDERED_DATA_MODE;  /* ordered */
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index e7ecc7c8a729..4938e78cbadc 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1336,6 +1336,9 @@  struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
 		}
 	}
 
+	if (ext4_should_enable_large_folio(inode))
+		mapping_set_large_folios(inode->i_mapping);
+
 	ext4_update_inode_fsync_trans(handle, inode, 1);
 
 	err = ext4_mark_inode_dirty(handle, inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 29eccdf8315a..7fd3921cfe46 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4774,6 +4774,23 @@  static int check_igot_inode(struct inode *inode, ext4_iget_flags flags,
 	return -EFSCORRUPTED;
 }
 
+bool ext4_should_enable_large_folio(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+
+	if (!S_ISREG(inode->i_mode))
+		return false;
+	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
+	    ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
+		return false;
+	if (ext4_has_feature_verity(sb))
+		return false;
+	if (ext4_has_feature_encrypt(sb))
+		return false;
+
+	return true;
+}
+
 struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
 			  ext4_iget_flags flags, const char *function,
 			  unsigned int line)
@@ -5096,6 +5113,9 @@  struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
 		ret = -EFSCORRUPTED;
 		goto bad_inode;
 	}
+	if (ext4_should_enable_large_folio(inode))
+		mapping_set_large_folios(inode->i_mapping);
+
 	ret = check_igot_inode(inode, flags, function, line);
 	/*
 	 * -ESTALE here means there is nothing inherently wrong with the inode,