Message ID | 20220402084023.1841375-1-anserper@ya.ru |
---|---|
State | New |
Headers | show |
Series | [v3] ext4: truncate during setxattr leads to kernel panic | expand |
On Apr 2, 2022, at 2:40 AM, anserper@ya.ru wrote: > > From: Andrew Perepechko <andrew.perepechko@hpe.com> > > When changing a large xattr value to a different large xattr value, > the old xattr inode is freed. Truncate during the final iput causes > current transaction restart. Eventually, parent inode bh is marked > dirty and kernel panic happens when jbd2 figures out that this bh > belongs to the committed transaction. > > A possible fix is to call this final iput in a separate thread. > This way, setxattr transactions will never be split into two. > Since the setxattr code adds xattr inodes with nlink=0 into the > orphan list, old xattr inodes will be properly cleaned up in > any case. > > Signed-off-by: Andrew Perepechko <andrew.perepechko@hpe.com> Reviewed-by: Andreas Dilger <adilger@dilger.ca> > HPE-bug-id: LUS-10534 > > Changes since v1: > - fixed a bug added during the porting > - fixed a workqueue related deadlock reported by Tetsuo Handa > --- > fs/ext4/ext4.h | 7 +++++-- > fs/ext4/page-io.c | 2 +- > fs/ext4/super.c | 15 ++++++++------- > fs/ext4/xattr.c | 39 +++++++++++++++++++++++++++++++++++++-- > 4 files changed, 51 insertions(+), 12 deletions(-) > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h > index 3f87cca49f0c..52db5d6bae7f 100644 > --- a/fs/ext4/ext4.h > +++ b/fs/ext4/ext4.h > @@ -1650,8 +1650,11 @@ struct ext4_sb_info { > struct flex_groups * __rcu *s_flex_groups; > ext4_group_t s_flex_groups_allocated; > > - /* workqueue for reserved extent conversions (buffered io) */ > - struct workqueue_struct *rsv_conversion_wq; > + /* > + * workqueue for reserved extent conversions (buffered io) > + * and large ea inodes reclaim > + */ > + struct workqueue_struct *s_misc_wq; > > /* timer for periodic error stats printing */ > struct timer_list s_err_report; > diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c > index 495ce59fb4ad..0142b88471ff 100644 > --- a/fs/ext4/page-io.c > +++ b/fs/ext4/page-io.c > @@ -228,7 +228,7 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end) > WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); > WARN_ON(!io_end->handle && sbi->s_journal); > spin_lock_irqsave(&ei->i_completed_io_lock, flags); > - wq = sbi->rsv_conversion_wq; > + wq = sbi->s_misc_wq; > if (list_empty(&ei->i_rsv_conversion_list)) > queue_work(wq, &ei->i_rsv_conversion_work); > list_add_tail(&io_end->list, &ei->i_rsv_conversion_list); > diff --git a/fs/ext4/super.c b/fs/ext4/super.c > index 81749eaddf4c..ee03f593b264 100644 > --- a/fs/ext4/super.c > +++ b/fs/ext4/super.c > @@ -1200,10 +1200,11 @@ static void ext4_put_super(struct super_block *sb) > int i, err; > > ext4_unregister_li_request(sb); > + flush_workqueue(sbi->s_misc_wq); > ext4_quota_off_umount(sb); > > flush_work(&sbi->s_error_work); > - destroy_workqueue(sbi->rsv_conversion_wq); > + destroy_workqueue(sbi->s_misc_wq); > ext4_release_orphan_info(sb); > > /* > @@ -5294,9 +5295,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) > * The maximum number of concurrent works can be high and > * concurrency isn't really necessary. Limit it to 1. > */ > - EXT4_SB(sb)->rsv_conversion_wq = > - alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); > - if (!EXT4_SB(sb)->rsv_conversion_wq) { > + EXT4_SB(sb)->s_misc_wq = > + alloc_workqueue("ext4-misc", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); > + if (!EXT4_SB(sb)->s_misc_wq) { > printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); > ret = -ENOMEM; > goto failed_mount4; > @@ -5514,8 +5515,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) > sb->s_root = NULL; > failed_mount4: > ext4_msg(sb, KERN_ERR, "mount failed"); > - if (EXT4_SB(sb)->rsv_conversion_wq) > - destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); > + if (EXT4_SB(sb)->s_misc_wq) > + destroy_workqueue(EXT4_SB(sb)->s_misc_wq); > failed_mount_wq: > ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); > sbi->s_ea_inode_cache = NULL; > @@ -6129,7 +6130,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) > return 0; > > trace_ext4_sync_fs(sb, wait); > - flush_workqueue(sbi->rsv_conversion_wq); > + flush_workqueue(sbi->s_misc_wq); > /* > * Writeback quota in non-journalled quota case - journalled quota has > * no dirty dquots > diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c > index 042325349098..ee13675fbead 100644 > --- a/fs/ext4/xattr.c > +++ b/fs/ext4/xattr.c > @@ -1544,6 +1544,36 @@ static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode, > return 0; > } > > +struct delayed_iput_work { > + struct work_struct work; > + struct inode *inode; > +}; > + > +static void delayed_iput_fn(struct work_struct *work) > +{ > + struct delayed_iput_work *diwork; > + > + diwork = container_of(work, struct delayed_iput_work, work); > + iput(diwork->inode); > + kfree(diwork); > +} > + > +static void delayed_iput(struct inode *inode, struct delayed_iput_work *work) > +{ > + if (!inode) { > + kfree(work); > + return; > + } > + > + if (!work) { > + iput(inode); > + } else { > + INIT_WORK(&work->work, delayed_iput_fn); > + work->inode = inode; > + queue_work(EXT4_SB(inode->i_sb)->s_misc_wq, &work->work); > + } > +} > + > /* > * Reserve min(block_size/8, 1024) bytes for xattr entries/names if ea_inode > * feature is enabled. > @@ -1561,6 +1591,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, > int in_inode = i->in_inode; > struct inode *old_ea_inode = NULL; > struct inode *new_ea_inode = NULL; > + struct delayed_iput_work *diwork = NULL; > size_t old_size, new_size; > int ret; > > @@ -1637,7 +1668,11 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, > * Finish that work before doing any modifications to the xattr data. > */ > if (!s->not_found && here->e_value_inum) { > - ret = ext4_xattr_inode_iget(inode, > + diwork = kmalloc(sizeof(*diwork), GFP_NOFS); > + if (!diwork) > + ret = -ENOMEM; > + else > + ret = ext4_xattr_inode_iget(inode, > le32_to_cpu(here->e_value_inum), > le32_to_cpu(here->e_hash), > &old_ea_inode); > @@ -1790,7 +1825,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, > > ret = 0; > out: > - iput(old_ea_inode); > + delayed_iput(old_ea_inode, diwork); > iput(new_ea_inode); > return ret; > } > -- > 2.25.1 > Cheers, Andreas
On 22/04/02 11:40AM, anserper@ya.ru wrote: > From: Andrew Perepechko <andrew.perepechko@hpe.com> > > When changing a large xattr value to a different large xattr value, > the old xattr inode is freed. Truncate during the final iput causes > current transaction restart. Eventually, parent inode bh is marked > dirty and kernel panic happens when jbd2 figures out that this bh > belongs to the committed transaction. > > A possible fix is to call this final iput in a separate thread. > This way, setxattr transactions will never be split into two. > Since the setxattr code adds xattr inodes with nlink=0 into the > orphan list, old xattr inodes will be properly cleaned up in > any case. Ok, I think there is a lot happening in above description. I think part of the problem I am unable to understand it easily is because I haven't spend much time with xattr code. But I think below 2 requests will be good to have - 1. Do we have the call stack for this problem handy. I think it will be good to mention it in the commit message itself. It is sometimes easy to look at the call stack if someone else encounters a similar problem. That also gives more idea about where the problem is occuring. 2. Do we have a easy reproducer for this problem? I think it will be a good addition to fstests given that this adds another context in calling iput on old_ea_inode. > > Signed-off-by: Andrew Perepechko <andrew.perepechko@hpe.com> > HPE-bug-id: LUS-10534 ^^^ I think above can be dropped. Any fixes tag instead? -ritesh
Reviewed-by: Artem Blagodarenko <artem.blagodarenko@gmail.com> > 2 апр. 2022 г., в 11:40, anserper@ya.ru написал(а): > > From: Andrew Perepechko <andrew.perepechko@hpe.com> > > When changing a large xattr value to a different large xattr value, > the old xattr inode is freed. Truncate during the final iput causes > current transaction restart. Eventually, parent inode bh is marked > dirty and kernel panic happens when jbd2 figures out that this bh > belongs to the committed transaction. > > A possible fix is to call this final iput in a separate thread. > This way, setxattr transactions will never be split into two. > Since the setxattr code adds xattr inodes with nlink=0 into the > orphan list, old xattr inodes will be properly cleaned up in > any case. > > Signed-off-by: Andrew Perepechko <andrew.perepechko@hpe.com> > HPE-bug-id: LUS-10534 > > Changes since v1: > - fixed a bug added during the porting > - fixed a workqueue related deadlock reported by Tetsuo Handa > --- > fs/ext4/ext4.h | 7 +++++-- > fs/ext4/page-io.c | 2 +- > fs/ext4/super.c | 15 ++++++++------- > fs/ext4/xattr.c | 39 +++++++++++++++++++++++++++++++++++++-- > 4 files changed, 51 insertions(+), 12 deletions(-) > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h > index 3f87cca49f0c..52db5d6bae7f 100644 > --- a/fs/ext4/ext4.h > +++ b/fs/ext4/ext4.h > @@ -1650,8 +1650,11 @@ struct ext4_sb_info { > struct flex_groups * __rcu *s_flex_groups; > ext4_group_t s_flex_groups_allocated; > > - /* workqueue for reserved extent conversions (buffered io) */ > - struct workqueue_struct *rsv_conversion_wq; > + /* > + * workqueue for reserved extent conversions (buffered io) > + * and large ea inodes reclaim > + */ > + struct workqueue_struct *s_misc_wq; > > /* timer for periodic error stats printing */ > struct timer_list s_err_report; > diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c > index 495ce59fb4ad..0142b88471ff 100644 > --- a/fs/ext4/page-io.c > +++ b/fs/ext4/page-io.c > @@ -228,7 +228,7 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end) > WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); > WARN_ON(!io_end->handle && sbi->s_journal); > spin_lock_irqsave(&ei->i_completed_io_lock, flags); > - wq = sbi->rsv_conversion_wq; > + wq = sbi->s_misc_wq; > if (list_empty(&ei->i_rsv_conversion_list)) > queue_work(wq, &ei->i_rsv_conversion_work); > list_add_tail(&io_end->list, &ei->i_rsv_conversion_list); > diff --git a/fs/ext4/super.c b/fs/ext4/super.c > index 81749eaddf4c..ee03f593b264 100644 > --- a/fs/ext4/super.c > +++ b/fs/ext4/super.c > @@ -1200,10 +1200,11 @@ static void ext4_put_super(struct super_block *sb) > int i, err; > > ext4_unregister_li_request(sb); > + flush_workqueue(sbi->s_misc_wq); > ext4_quota_off_umount(sb); > > flush_work(&sbi->s_error_work); > - destroy_workqueue(sbi->rsv_conversion_wq); > + destroy_workqueue(sbi->s_misc_wq); > ext4_release_orphan_info(sb); > > /* > @@ -5294,9 +5295,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) > * The maximum number of concurrent works can be high and > * concurrency isn't really necessary. Limit it to 1. > */ > - EXT4_SB(sb)->rsv_conversion_wq = > - alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); > - if (!EXT4_SB(sb)->rsv_conversion_wq) { > + EXT4_SB(sb)->s_misc_wq = > + alloc_workqueue("ext4-misc", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); > + if (!EXT4_SB(sb)->s_misc_wq) { > printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); > ret = -ENOMEM; > goto failed_mount4; > @@ -5514,8 +5515,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) > sb->s_root = NULL; > failed_mount4: > ext4_msg(sb, KERN_ERR, "mount failed"); > - if (EXT4_SB(sb)->rsv_conversion_wq) > - destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); > + if (EXT4_SB(sb)->s_misc_wq) > + destroy_workqueue(EXT4_SB(sb)->s_misc_wq); > failed_mount_wq: > ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); > sbi->s_ea_inode_cache = NULL; > @@ -6129,7 +6130,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) > return 0; > > trace_ext4_sync_fs(sb, wait); > - flush_workqueue(sbi->rsv_conversion_wq); > + flush_workqueue(sbi->s_misc_wq); > /* > * Writeback quota in non-journalled quota case - journalled quota has > * no dirty dquots > diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c > index 042325349098..ee13675fbead 100644 > --- a/fs/ext4/xattr.c > +++ b/fs/ext4/xattr.c > @@ -1544,6 +1544,36 @@ static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode, > return 0; > } > > +struct delayed_iput_work { > + struct work_struct work; > + struct inode *inode; > +}; > + > +static void delayed_iput_fn(struct work_struct *work) > +{ > + struct delayed_iput_work *diwork; > + > + diwork = container_of(work, struct delayed_iput_work, work); > + iput(diwork->inode); > + kfree(diwork); > +} > + > +static void delayed_iput(struct inode *inode, struct delayed_iput_work *work) > +{ > + if (!inode) { > + kfree(work); > + return; > + } > + > + if (!work) { > + iput(inode); > + } else { > + INIT_WORK(&work->work, delayed_iput_fn); > + work->inode = inode; > + queue_work(EXT4_SB(inode->i_sb)->s_misc_wq, &work->work); > + } > +} > + > /* > * Reserve min(block_size/8, 1024) bytes for xattr entries/names if ea_inode > * feature is enabled. > @@ -1561,6 +1591,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, > int in_inode = i->in_inode; > struct inode *old_ea_inode = NULL; > struct inode *new_ea_inode = NULL; > + struct delayed_iput_work *diwork = NULL; > size_t old_size, new_size; > int ret; > > @@ -1637,7 +1668,11 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, > * Finish that work before doing any modifications to the xattr data. > */ > if (!s->not_found && here->e_value_inum) { > - ret = ext4_xattr_inode_iget(inode, > + diwork = kmalloc(sizeof(*diwork), GFP_NOFS); > + if (!diwork) > + ret = -ENOMEM; > + else > + ret = ext4_xattr_inode_iget(inode, > le32_to_cpu(here->e_value_inum), > le32_to_cpu(here->e_hash), > &old_ea_inode); > @@ -1790,7 +1825,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, > > ret = 0; > out: > - iput(old_ea_inode); > + delayed_iput(old_ea_inode, diwork); > iput(new_ea_inode); > return ret; > } > -- > 2.25.1 >
On Tue, Apr 05, 2022 at 03:24:51PM +0530, Ritesh Harjani wrote: > On 22/04/02 11:40AM, anserper@ya.ru wrote: > > From: Andrew Perepechko <andrew.perepechko@hpe.com> > > > > When changing a large xattr value to a different large xattr value, > > the old xattr inode is freed. Truncate during the final iput causes > > current transaction restart. Eventually, parent inode bh is marked > > dirty and kernel panic happens when jbd2 figures out that this bh > > belongs to the committed transaction. > > > > A possible fix is to call this final iput in a separate thread. > > This way, setxattr transactions will never be split into two. > > Since the setxattr code adds xattr inodes with nlink=0 into the > > orphan list, old xattr inodes will be properly cleaned up in > > any case. > > Ok, I think there is a lot happening in above description. I think part of the > problem I am unable to understand it easily is because I haven't spend much time > with xattr code. But I think below 2 requests will be good to have - > > 1. Do we have the call stack for this problem handy. I think it will be good to > mention it in the commit message itself. It is sometimes easy to look at the > call stack if someone else encounters a similar problem. That also gives more > idea about where the problem is occuring. > > 2. Do we have a easy reproducer for this problem? I think it will be a good > addition to fstests given that this adds another context in calling iput on > old_ea_inode. Andrew, would it be possible for you to supply a call stack and a reproducer? It sounds like what's going on is if the file system has the ea_inode feature enabled, and we have a large xattr value which is stored in an inode, it's possible if that when that inode is truncated, it is spread across two transactions. But the problem is that when the iput(ea_inode) is called from ext4_xattr_set_entry(), there is a handle which is passed into that function, since the xattr operation is part of its own transaction, and so the truncate operation is part of "nested handle". That's OK, so long as the initial handle is started with sufficient credits for the nested start_handle. But when that handle is closed, and then re-opened, it has two problems. The first is that the xattr operation is no longer atomic (and spread across two transaction). The second is that if the write access to the inode table's bh was requested before the implied truncate from iput(ea_inode), then when we call handle_dirty_metadata() on that bh, we get a jbd2 assertion. (Which is good, because it notifies and catches the first problem.) So by moving the iput to a separate thread, it avoids this problem, since the truncate can take place in its own handle. The other solution would be to some how pass the inode all the way up through the call chain, and only call iput(ea_inode) after handle is stopped. But that would require quite a lot of code surgery, since ext4_xattr_set_entry is called in a number of places, and the iput() would have to be plumbed up through two callers to where the handle is actually stopped. - Ted
Hello Theodore and Ritesh! Thank you for your feedback. Please, let me give you some details. First of all, the original issue was found with Lustre as the primary user of ext4. In this particular use case, Lustre is very similar to a userspace ext4 user except that it has its own transaction logic that covers setxattr and some other operation (so it's even harder to fix the issue by pushing iput from under transaction). When the root cause was identified, I was able to reproduce the bug without Lustre with the following script: dd if=/dev/zero of=/tmp/ldiskfs bs=1M count=100 mkfs.ext4 -O ea_inode /tmp/ldiskfs -J size=16 -I 512 mkdir -p /tmp/ldiskfs_m mount -t ext4 /tmp/ldiskfs /tmp/ldiskfs_m -o loop,commit=600,no_mbcache touch /tmp/ldiskfs_m/file{1..1024} V=$(for i in `seq 60000`; do echo -n x ; done) V1="1$V" V2="2$V" for k in 1 2 3 4 5 6 7 8 9; do setfattr -n user.xattr -v $V /tmp/ldiskfs_m/file{1..1024} setfattr -n user.xattr -v $V1 /tmp/ldiskfs_m/file{1..1024} & setfattr -n user.xattr -v $V2 /tmp/ldiskfs_m/file{1024..1} & wait done umount /tmp/ldiskfs_m Please note that the reproducer does not really depend on sizes or options such as commit=600 or no_mbcache. However, no_mbcache allows the reproducer to use a small set of large ea values for the race. With mbcache, large EA inodes can be aggressively reused. The above reproducer triggers the following oops (using a build from a recent linux.git commit): [ 181.269541] ------------[ cut here ]------------ [ 181.269733] kernel BUG at fs/jbd2/transaction.c:1511! [ 181.269951] invalid opcode: 0000 [#1] PREEMPT SMP KASAN NOPTI [ 181.270169] CPU: 0 PID: 940 Comm: setfattr Not tainted 5.17.0-13430-g787af64d05cd #9 [ 181.270243] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 181.270243] RIP: 0010:jbd2_journal_dirty_metadata+0x400/0x420 [ 181.270243] Code: 24 4c 4c 8b 0c 24 41 83 f8 01 0f 84 3c ff ff ff e9 24 94 0b 01 48 8b 7c 24 08 e8 cb f6 df ff 4d 39 6c 24 70 0f 84 e6 fc ff ff <0f> 0b 0f 0b c7 4 4 24 18 e4 ff ff ff e9 9f fe ff ff 0f 0b c7 44 24 [ 181.270243] RSP: 0018:ffff88802632f698 EFLAGS: 00010207 [ 181.270243] RAX: 0000000000000000 RBX: ffff88800044fcc8 RCX: ffffffffa73a03b5 [ 181.270243] RDX: dffffc0000000000 RSI: 0000000000000004 RDI: ffff888032ff6f58 [ 181.270243] RBP: ffff88802eb04418 R08: ffffffffa6f502bf R09: ffffed1004c65ec6 [ 181.270243] R10: 0000000000000003 R11: ffffed1004c65ec5 R12: ffff888032ff6ee8 [ 181.270243] R13: ffff888024af8300 R14: ffff88800044fcec R15: ffff888032ff6f50 [ 181.270243] FS: 00007ffb5f6e3740(0000) GS:ffff888034800000(0000) knlGS:0000000000000000 [ 181.270243] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 181.270243] CR2: 00007f9ca09dc024 CR3: 0000000032244000 CR4: 00000000000506f0 [ 181.270243] Call Trace: [ 181.270243] <TASK> [ 181.270243] __ext4_handle_dirty_metadata+0xb1/0x330 [ 181.270243] ext4_mark_iloc_dirty+0x2b7/0xcd0 [ 181.270243] ext4_xattr_set_handle+0x694/0xaf0 [ 181.270243] ext4_xattr_set+0x164/0x260 [ 181.270243] __vfs_setxattr+0xcb/0x110 [ 181.270243] __vfs_setxattr_noperm+0x8c/0x300 [ 181.270243] vfs_setxattr+0xff/0x250 [ 181.270243] setxattr+0x14a/0x260 [ 181.270243] path_setxattr+0x132/0x150 [ 181.270243] __x64_sys_setxattr+0x63/0x70 [ 181.270243] do_syscall_64+0x3b/0x90 [ 181.270243] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 181.270243] RIP: 0033:0x7ffb5efebbee This oops is caused by the fact that the inode bh was initially added to another transaction. However, when it's being marked dirty, the running transaction has already changed because of transaction restart in truncate. Unfortunately, it's not the only issue here (so it cannot be fixed by simply re-adding bh to the new transaction) since transaction restart forced the earlier transaction to commit and the kernel panic left the running transaction uncommitted, on-disk corruption happens, kernel panic will happen after remount if the corrupted inode/large EA is accessed again. An example of a panic due to ondisk corruption after remount from a Lustre system: [203570.602463] LDISKFS-fs error (device md65): ldiskfs_xattr_inode_iget:392: comm mdt04_008: error while reading EA inode 807598115 err=-116 [203570.617935] Aborting journal on device md65-8. [203570.628176] Kernel panic - not syncing: LDISKFS-fs (device md65): panic forced after error This whole text is probably an overkill for a commit message. What do you think I should keep for the commit message? Thank you, Andrew > On Tue, Apr 05, 2022 at 03:24:51PM +0530, Ritesh Harjani wrote: >> On 22/04/02 11:40AM, anserper@ya.ru wrote: >>> From: Andrew Perepechko <andrew.perepechko@hpe.com> >>> >>> When changing a large xattr value to a different large xattr value, >>> the old xattr inode is freed. Truncate during the final iput causes >>> current transaction restart. Eventually, parent inode bh is marked >>> dirty and kernel panic happens when jbd2 figures out that this bh >>> belongs to the committed transaction. >>> >>> A possible fix is to call this final iput in a separate thread. >>> This way, setxattr transactions will never be split into two. >>> Since the setxattr code adds xattr inodes with nlink=0 into the >>> orphan list, old xattr inodes will be properly cleaned up in >>> any case. >> Ok, I think there is a lot happening in above description. I think part of the >> problem I am unable to understand it easily is because I haven't spend much time >> with xattr code. But I think below 2 requests will be good to have - >> >> 1. Do we have the call stack for this problem handy. I think it will be good to >> mention it in the commit message itself. It is sometimes easy to look at the >> call stack if someone else encounters a similar problem. That also gives more >> idea about where the problem is occuring. >> >> 2. Do we have a easy reproducer for this problem? I think it will be a good >> addition to fstests given that this adds another context in calling iput on >> old_ea_inode. > Andrew, would it be possible for you to supply a call stack and a > reproducer? > > It sounds like what's going on is if the file system has the ea_inode > feature enabled, and we have a large xattr value which is stored in an > inode, it's possible if that when that inode is truncated, it is > spread across two transactions. > > But the problem is that when the iput(ea_inode) is called from > ext4_xattr_set_entry(), there is a handle which is passed into that > function, since the xattr operation is part of its own transaction, > and so the truncate operation is part of "nested handle". That's OK, > so long as the initial handle is started with sufficient credits for > the nested start_handle. But when that handle is closed, and then > re-opened, it has two problems. The first is that the xattr operation > is no longer atomic (and spread across two transaction). The second > is that if the write access to the inode table's bh was requested > before the implied truncate from iput(ea_inode), then when we call > handle_dirty_metadata() on that bh, we get a jbd2 assertion. (Which > is good, because it notifies and catches the first problem.) > > So by moving the iput to a separate thread, it avoids this problem, > since the truncate can take place in its own handle. The other > solution would be to some how pass the inode all the way up through > the call chain, and only call iput(ea_inode) after handle is stopped. > But that would require quite a lot of code surgery, since > ext4_xattr_set_entry is called in a number of places, and the iput() > would have to be plumbed up through two callers to where the handle is > actually stopped. > > - Ted
On Thu, Apr 07, 2022 at 09:25:40PM +0300, Andrew wrote: > > dd if=/dev/zero of=/tmp/ldiskfs bs=1M count=100 > mkfs.ext4 -O ea_inode /tmp/ldiskfs -J size=16 -I 512 > > mkdir -p /tmp/ldiskfs_m > mount -t ext4 /tmp/ldiskfs /tmp/ldiskfs_m -o loop,commit=600,no_mbcache > touch /tmp/ldiskfs_m/file{1..1024} > > V=$(for i in `seq 60000`; do echo -n x ; done) > V1="1$V" > V2="2$V" > > for k in 1 2 3 4 5 6 7 8 9; do > setfattr -n user.xattr -v $V /tmp/ldiskfs_m/file{1..1024} > setfattr -n user.xattr -v $V1 /tmp/ldiskfs_m/file{1..1024} & > setfattr -n user.xattr -v $V2 /tmp/ldiskfs_m/file{1024..1} & > wait > done > > umount /tmp/ldiskfs_m Hi Andrew, Thanks for the reproducer. I'll note that with the proposed patch, we will allocate a *large* number of delayed_iput_work structure, and most of the time, it is not necessary to do a delayed_iput, since we won't actually be releasing the ea_inode. That only happens when the ea_inode's refcount goes to zero. So we could significantly reduce the overhead of this patch by plumbing whether the refcount went to zero in ext4_xattr_inode_update_ref() up through ext4_xattr_inode_dec_ref() to ext4_xattr_set_entry(), and have it only call delayed_iput() if the refcount went to zero. Alternatively, we could add a function to fs/ext4/orphan.c which checks whether the old_ea_inode is on the orphan list. And if it is on the orphan list, then we know that the refcount must have gone to zero, and so we need to call delayed_iput(). If the ea_inode isn't on the orphan list, we can just use iput(), which will be fast, since it will only be dropping i_count, and we don't need to worry about the ea_inode getting truncate. Could you take a look at this optimization, and then update the commit description to explain what's happening, inlude the kernel stack, and the reproducer? Many thanks! - Ted
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3f87cca49f0c..52db5d6bae7f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1650,8 +1650,11 @@ struct ext4_sb_info { struct flex_groups * __rcu *s_flex_groups; ext4_group_t s_flex_groups_allocated; - /* workqueue for reserved extent conversions (buffered io) */ - struct workqueue_struct *rsv_conversion_wq; + /* + * workqueue for reserved extent conversions (buffered io) + * and large ea inodes reclaim + */ + struct workqueue_struct *s_misc_wq; /* timer for periodic error stats printing */ struct timer_list s_err_report; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 495ce59fb4ad..0142b88471ff 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -228,7 +228,7 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end) WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); WARN_ON(!io_end->handle && sbi->s_journal); spin_lock_irqsave(&ei->i_completed_io_lock, flags); - wq = sbi->rsv_conversion_wq; + wq = sbi->s_misc_wq; if (list_empty(&ei->i_rsv_conversion_list)) queue_work(wq, &ei->i_rsv_conversion_work); list_add_tail(&io_end->list, &ei->i_rsv_conversion_list); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 81749eaddf4c..ee03f593b264 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1200,10 +1200,11 @@ static void ext4_put_super(struct super_block *sb) int i, err; ext4_unregister_li_request(sb); + flush_workqueue(sbi->s_misc_wq); ext4_quota_off_umount(sb); flush_work(&sbi->s_error_work); - destroy_workqueue(sbi->rsv_conversion_wq); + destroy_workqueue(sbi->s_misc_wq); ext4_release_orphan_info(sb); /* @@ -5294,9 +5295,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) * The maximum number of concurrent works can be high and * concurrency isn't really necessary. Limit it to 1. */ - EXT4_SB(sb)->rsv_conversion_wq = - alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); - if (!EXT4_SB(sb)->rsv_conversion_wq) { + EXT4_SB(sb)->s_misc_wq = + alloc_workqueue("ext4-misc", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); + if (!EXT4_SB(sb)->s_misc_wq) { printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); ret = -ENOMEM; goto failed_mount4; @@ -5514,8 +5515,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) sb->s_root = NULL; failed_mount4: ext4_msg(sb, KERN_ERR, "mount failed"); - if (EXT4_SB(sb)->rsv_conversion_wq) - destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); + if (EXT4_SB(sb)->s_misc_wq) + destroy_workqueue(EXT4_SB(sb)->s_misc_wq); failed_mount_wq: ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); sbi->s_ea_inode_cache = NULL; @@ -6129,7 +6130,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) return 0; trace_ext4_sync_fs(sb, wait); - flush_workqueue(sbi->rsv_conversion_wq); + flush_workqueue(sbi->s_misc_wq); /* * Writeback quota in non-journalled quota case - journalled quota has * no dirty dquots diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 042325349098..ee13675fbead 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1544,6 +1544,36 @@ static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode, return 0; } +struct delayed_iput_work { + struct work_struct work; + struct inode *inode; +}; + +static void delayed_iput_fn(struct work_struct *work) +{ + struct delayed_iput_work *diwork; + + diwork = container_of(work, struct delayed_iput_work, work); + iput(diwork->inode); + kfree(diwork); +} + +static void delayed_iput(struct inode *inode, struct delayed_iput_work *work) +{ + if (!inode) { + kfree(work); + return; + } + + if (!work) { + iput(inode); + } else { + INIT_WORK(&work->work, delayed_iput_fn); + work->inode = inode; + queue_work(EXT4_SB(inode->i_sb)->s_misc_wq, &work->work); + } +} + /* * Reserve min(block_size/8, 1024) bytes for xattr entries/names if ea_inode * feature is enabled. @@ -1561,6 +1591,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, int in_inode = i->in_inode; struct inode *old_ea_inode = NULL; struct inode *new_ea_inode = NULL; + struct delayed_iput_work *diwork = NULL; size_t old_size, new_size; int ret; @@ -1637,7 +1668,11 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, * Finish that work before doing any modifications to the xattr data. */ if (!s->not_found && here->e_value_inum) { - ret = ext4_xattr_inode_iget(inode, + diwork = kmalloc(sizeof(*diwork), GFP_NOFS); + if (!diwork) + ret = -ENOMEM; + else + ret = ext4_xattr_inode_iget(inode, le32_to_cpu(here->e_value_inum), le32_to_cpu(here->e_hash), &old_ea_inode); @@ -1790,7 +1825,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, ret = 0; out: - iput(old_ea_inode); + delayed_iput(old_ea_inode, diwork); iput(new_ea_inode); return ret; }