diff mbox series

[RFC,v7,4/7] ext4: fast commit: avoid self-deadlock in inode snapshotting

Message ID 20260511084304.1559557-5-me@linux.beauty
State Superseded
Headers show
Series ext4: fast commit: snapshot inode state for FC log | expand

Commit Message

Li Chen May 11, 2026, 8:42 a.m. UTC
ext4_fc_snapshot_inodes() used igrab()/iput() to pin inodes while building
commit-time snapshots. With ext4_fc_del() waiting for
EXT4_STATE_FC_COMMITTING, iput() can trigger
ext4_clear_inode()->ext4_fc_del() in the commit thread and deadlock waiting
for the fast commit to finish.

Avoid taking extra references. Collect inode pointers under s_fc_lock and
rely on EXT4_STATE_FC_COMMITTING to pin inodes until ext4_fc_cleanup()
clears the bit.

Also set EXT4_STATE_FC_COMMITTING for create-only inodes referenced
from the dentry update queue, and wake up waiters when ext4_fc_cleanup()
clears the bit.

Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
---
 fs/ext4/fast_commit.c | 47 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 35 insertions(+), 12 deletions(-)

Comments

Li Chen May 14, 2026, 7:28 a.m. UTC | #1
Hi sashiko,

 ---- On Tue, 12 May 2026 11:46:39 +0800   <sashiko-bot@kernel.org> wrote --- 
 > Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
 > - [Critical] Removing igrab() exposes a critical use-after-free vulnerability because ext4_fc_del() can miss the EXT4_STATE_FC_COMMITTING state transition and free the inode while it is actively being snapshotted.
 > --
 > 
 > commit 39c5450f50154240ff764cf3a63e011d8e973fd4
 > Author: Li Chen <me@linux.beauty>
 > 
 > ext4: fast commit: avoid self-deadlock in inode snapshotting
 > 
 > This commit removes igrab()/iput() during fast commit snapshotting, relying
 > instead on the EXT4_STATE_FC_COMMITTING flag to keep inodes pinned while
 > their pointers are stored in the inodes[] array for snapshotting.
 > 
 > > diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
 > > index 273bf34031aeb..f9bb18c0b5497 100644
 > > --- a/fs/ext4/fast_commit.c
 > > +++ b/fs/ext4/fast_commit.c
 > > @@ -1195,13 +1195,12 @@ static int ext4_fc_snapshot_inodes(journal_t *journal)
 > >  
 > >      alloc_ctx = ext4_fc_lock(sb);
 > >      list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 > > -        inodes[i] = igrab(&iter->vfs_inode);
 > > -        if (inodes[i])
 > > -            i++;
 > > +        inodes[i++] = &iter->vfs_inode;
 > >      }
 > 
 > Is it possible for this change to cause a use-after-free of the inode?
 > 
 > If ext4_fc_del() is called during inode eviction, it waits sequentially for
 > EXT4_STATE_FC_COMMITTING and then EXT4_STATE_FC_FLUSHING_DATA. However, it
 > doesn't appear to loop back to re-check EXT4_STATE_FC_COMMITTING after
 > waiting for the flush bit.
 > 
 > Could a sequence like this occur?
 > 
 > 1. ext4_fc_perform_commit() sets EXT4_STATE_FC_FLUSHING_DATA.
 > 2. Eviction calls ext4_fc_del(), which sees EXT4_STATE_FC_COMMITTING is
 >    false, but EXT4_STATE_FC_FLUSHING_DATA is true, so it waits on the
 >    flush bit.
 > 3. ext4_fc_perform_commit() clears EXT4_STATE_FC_FLUSHING_DATA, waking up
 >    ext4_fc_del().
 > 4. Before ext4_fc_del() resumes, ext4_fc_perform_commit() sets
 >    EXT4_STATE_FC_COMMITTING and saves the unpinned inode pointer into the
 >    inodes[] array.
 > 5. ext4_fc_del() resumes, skips re-checking EXT4_STATE_FC_COMMITTING,
 >    removes the inode from the fast commit list, and returns.
 > 6. The VFS completely frees the inode.
 > 7. ext4_fc_snapshot_inodes() iterates over the inodes[] array and passes
 >    the now dangling pointer to ext4_fc_snapshot_inode().
 > 
 > Does ext4_fc_del() need to re-verify the state flags after waking up, or
 > is there another mechanism preventing the inode from being freed before
 > ext4_fc_snapshot_inode() accesses it?
 > 
 > [ ... ]
 > 
 > -- 
 > Sashiko AI review · https://sashiko.dev/#/patchset/20260511084304.1559557-1-me@linux.beauty?part=4
 > 

Yes, I think you're right. I missed that window.

I'll make ext4_fc_del() loop back after waiting on FC_FLUSHING_DATA, so it checks FC_COMMITTING again
under s_fc_lock before removing the inode from the FC lists.

Regards,
Li​
diff mbox series

Patch

diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 273bf34031ae..f9bb18c0b549 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -1195,13 +1195,12 @@  static int ext4_fc_snapshot_inodes(journal_t *journal)
 
 	alloc_ctx = ext4_fc_lock(sb);
 	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
-		inodes[i] = igrab(&iter->vfs_inode);
-		if (inodes[i])
-			i++;
+		inodes[i++] = &iter->vfs_inode;
 	}
 
 	list_for_each_entry(fc_dentry, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
 		struct ext4_inode_info *ei;
+		struct inode *inode;
 
 		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT)
 			continue;
@@ -1211,12 +1210,20 @@  static int ext4_fc_snapshot_inodes(journal_t *journal)
 		/* See the comment in ext4_fc_commit_dentry_updates(). */
 		ei = list_first_entry(&fc_dentry->fcd_dilist,
 				      struct ext4_inode_info, i_fc_dilist);
+		inode = &ei->vfs_inode;
 		if (!list_empty(&ei->i_fc_list))
 			continue;
 
-		inodes[i] = igrab(&ei->vfs_inode);
-		if (inodes[i])
-			i++;
+		/*
+		 * Create-only inodes may only be referenced via fcd_dilist and
+		 * not appear on s_fc_q[MAIN]. They may hit the last iput while
+		 * we are snapshotting, but inode eviction calls ext4_fc_del(),
+		 * which waits for FC_COMMITTING to clear. Mark them FC_COMMITTING
+		 * so the inode stays pinned and the snapshot stays valid until
+		 * ext4_fc_cleanup().
+		 */
+		ext4_set_inode_state(inode, EXT4_STATE_FC_COMMITTING);
+		inodes[i++] = inode;
 	}
 	ext4_fc_unlock(sb, alloc_ctx);
 
@@ -1226,10 +1233,6 @@  static int ext4_fc_snapshot_inodes(journal_t *journal)
 			break;
 	}
 
-	for (nr_inodes = 0; nr_inodes < i; nr_inodes++) {
-		if (inodes[nr_inodes])
-			iput(inodes[nr_inodes]);
-	}
 	kvfree(inodes);
 	return ret;
 }
@@ -1297,8 +1300,9 @@  static int ext4_fc_perform_commit(journal_t *journal)
 	jbd2_journal_lock_updates(journal);
 	/*
 	 * The journal is now locked. No more handles can start and all the
-	 * previous handles are now drained. We now mark the inodes on the
-	 * commit queue as being committed.
+	 * previous handles are now drained. Snapshotting happens in this
+	 * window so log writing can consume only stable snapshots without
+	 * doing logical-to-physical mapping.
 	 */
 	alloc_ctx = ext4_fc_lock(sb);
 	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
@@ -1550,6 +1554,25 @@  static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
 					      struct ext4_inode_info,
 					      i_fc_dilist);
 			ext4_fc_free_inode_snap(&ei->vfs_inode);
+			spin_lock(&ei->i_fc_lock);
+			ext4_clear_inode_state(&ei->vfs_inode,
+					       EXT4_STATE_FC_REQUEUE);
+			ext4_clear_inode_state(&ei->vfs_inode,
+					       EXT4_STATE_FC_COMMITTING);
+			spin_unlock(&ei->i_fc_lock);
+			/*
+			 * Make sure clearing of EXT4_STATE_FC_COMMITTING is
+			 * visible before we send the wakeup. Pairs with implicit
+			 * barrier in prepare_to_wait() in ext4_fc_del().
+			 */
+			smp_mb();
+#if (BITS_PER_LONG < 64)
+			wake_up_bit(&ei->i_state_flags,
+				    EXT4_STATE_FC_COMMITTING);
+#else
+			wake_up_bit(&ei->i_flags,
+				    EXT4_STATE_FC_COMMITTING);
+#endif
 		}
 		list_del_init(&fc_dentry->fcd_dilist);