diff mbox series

[v3] ext4: fix bug for rename with RENAME_WHITEOUT

Message ID 20210105062857.3566-1-yangerkun@huawei.com
State Awaiting Upstream
Headers show
Series [v3] ext4: fix bug for rename with RENAME_WHITEOUT | expand

Commit Message

yangerkun Jan. 5, 2021, 6:28 a.m. UTC
We got a "deleted inode referenced" warning during our fsstress test. The
bug can be reproduced easily with the following steps:

  cd /dev/shm
  mkdir test/
  fallocate -l 128M img
  mkfs.ext4 -b 1024 img
  mount img test/
  dd if=/dev/zero of=test/foo bs=1M count=128
  mkdir test/dir/ && cd test/dir/
  for ((i=0;i<1000;i++)); do touch file$i; done # consume all block
  cd ~ && renameat2(AT_FDCWD, /dev/shm/test/dir/file1, AT_FDCWD,
    /dev/shm/test/dir/dst_file, RENAME_WHITEOUT) # ext4_add_entry in
    ext4_rename will return ENOSPC!!
  cd /dev/shm/ && umount test/ && mount img test/ && ls -li test/dir/file1
  We will get the output:
  "ls: cannot access 'test/dir/file1': Structure needs cleaning"
  and dmesg shows:
  "EXT4-fs error (device loop0): ext4_lookup:1626: inode #2049: comm ls:
  deleted inode referenced: 139"

ext4_rename will create a special inode for whiteout and use this 'ino'
to replace the source file's dir entry 'ino'. Once an error happens
later (the error above was the ENOSPC returned from ext4_add_entry in
ext4_rename since all space has been consumed), the cleanup does drop the
nlink for whiteout, but forgets to restore the dir entry's 'ino' to that of
the source file. This will trigger the bug described above.

Signed-off-by: yangerkun <yangerkun@huawei.com>
---
 fs/ext4/namei.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

Comments

Jan Kara Jan. 5, 2021, 2:27 p.m. UTC | #1
On Tue 05-01-21 14:28:57, yangerkun wrote:
> We got a "deleted inode referenced" warning cross our fsstress test. The
> bug can be reproduced easily with following steps:
> 
>   cd /dev/shm
>   mkdir test/
>   fallocate -l 128M img
>   mkfs.ext4 -b 1024 img
>   mount img test/
>   dd if=/dev/zero of=test/foo bs=1M count=128
>   mkdir test/dir/ && cd test/dir/
>   for ((i=0;i<1000;i++)); do touch file$i; done # consume all block
>   cd ~ && renameat2(AT_FDCWD, /dev/shm/test/dir/file1, AT_FDCWD,
>     /dev/shm/test/dir/dst_file, RENAME_WHITEOUT) # ext4_add_entry in
>     ext4_rename will return ENOSPC!!
>   cd /dev/shm/ && umount test/ && mount img test/ && ls -li test/dir/file1
>   We will get the output:
>   "ls: cannot access 'test/dir/file1': Structure needs cleaning"
>   and the dmesg show:
>   "EXT4-fs error (device loop0): ext4_lookup:1626: inode #2049: comm ls:
>   deleted inode referenced: 139"
> 
> ext4_rename will create a special inode for whiteout and use this 'ino'
> to replace the source file's dir entry 'ino'. Once error happens
> latter(the error above was the ENOSPC return from ext4_add_entry in
> ext4_rename since all space has been consumed), the cleanup do drop the
> nlink for whiteout, but forget to restore 'ino' with source file. This
> will trigger the bug describle as above.
> 
> Signed-off-by: yangerkun <yangerkun@huawei.com>

Thanks! The patch looks good to me now. You can add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  fs/ext4/namei.c | 17 +++++++++--------
>  1 file changed, 9 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index b17a082b7db1..90f7ebeb69c8 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -3593,9 +3593,6 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
>  			return retval2;
>  		}
>  	}
> -	brelse(ent->bh);
> -	ent->bh = NULL;
> -
>  	return retval;
>  }
>  
> @@ -3794,6 +3791,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
>  		}
>  	}
>  
> +	old_file_type = old.de->file_type;
>  	if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
>  		ext4_handle_sync(handle);
>  
> @@ -3821,7 +3819,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
>  	force_reread = (new.dir->i_ino == old.dir->i_ino &&
>  			ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA));
>  
> -	old_file_type = old.de->file_type;
>  	if (whiteout) {
>  		/*
>  		 * Do this before adding a new entry, so the old entry is sure
> @@ -3919,15 +3916,19 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
>  	retval = 0;
>  
>  end_rename:
> -	brelse(old.dir_bh);
> -	brelse(old.bh);
> -	brelse(new.bh);
>  	if (whiteout) {
> -		if (retval)
> +		if (retval) {
> +			ext4_setent(handle, &old,
> +				old.inode->i_ino, old_file_type);
>  			drop_nlink(whiteout);
> +		}
>  		unlock_new_inode(whiteout);
>  		iput(whiteout);
> +
>  	}
> +	brelse(old.dir_bh);
> +	brelse(old.bh);
> +	brelse(new.bh);
>  	if (handle)
>  		ext4_journal_stop(handle);
>  	return retval;
> -- 
> 2.25.4
>
Theodore Ts'o Jan. 14, 2021, 3:51 a.m. UTC | #2
On Tue, Jan 05, 2021 at 02:28:57PM +0800, yangerkun wrote:
> We got a "deleted inode referenced" warning cross our fsstress test. The
> bug can be reproduced easily with following steps:
> 
>   cd /dev/shm
>   mkdir test/
>   fallocate -l 128M img
>   mkfs.ext4 -b 1024 img
>   mount img test/
>   dd if=/dev/zero of=test/foo bs=1M count=128
>   mkdir test/dir/ && cd test/dir/
>   for ((i=0;i<1000;i++)); do touch file$i; done # consume all block
>   cd ~ && renameat2(AT_FDCWD, /dev/shm/test/dir/file1, AT_FDCWD,
>     /dev/shm/test/dir/dst_file, RENAME_WHITEOUT) # ext4_add_entry in
>     ext4_rename will return ENOSPC!!
>   cd /dev/shm/ && umount test/ && mount img test/ && ls -li test/dir/file1
>   We will get the output:
>   "ls: cannot access 'test/dir/file1': Structure needs cleaning"
>   and the dmesg show:
>   "EXT4-fs error (device loop0): ext4_lookup:1626: inode #2049: comm ls:
>   deleted inode referenced: 139"
> 
> ext4_rename will create a special inode for whiteout and use this 'ino'
> to replace the source file's dir entry 'ino'. Once error happens
> latter(the error above was the ENOSPC return from ext4_add_entry in
> ext4_rename since all space has been consumed), the cleanup do drop the
> nlink for whiteout, but forget to restore 'ino' with source file. This
> will trigger the bug describle as above.
> 
> Signed-off-by: yangerkun <yangerkun@huawei.com>

Thanks, replied.

					- Ted
Amir Goldstein Jan. 20, 2021, 6:57 a.m. UTC | #3
On Thu, Jan 14, 2021 at 5:53 AM Theodore Ts'o <tytso@mit.edu> wrote:
>
> On Tue, Jan 05, 2021 at 02:28:57PM +0800, yangerkun wrote:
> > We got a "deleted inode referenced" warning cross our fsstress test. The
> > bug can be reproduced easily with following steps:
> >
> >   cd /dev/shm
> >   mkdir test/
> >   fallocate -l 128M img
> >   mkfs.ext4 -b 1024 img
> >   mount img test/
> >   dd if=/dev/zero of=test/foo bs=1M count=128
> >   mkdir test/dir/ && cd test/dir/
> >   for ((i=0;i<1000;i++)); do touch file$i; done # consume all block
> >   cd ~ && renameat2(AT_FDCWD, /dev/shm/test/dir/file1, AT_FDCWD,
> >     /dev/shm/test/dir/dst_file, RENAME_WHITEOUT) # ext4_add_entry in
> >     ext4_rename will return ENOSPC!!
> >   cd /dev/shm/ && umount test/ && mount img test/ && ls -li test/dir/file1
> >   We will get the output:
> >   "ls: cannot access 'test/dir/file1': Structure needs cleaning"
> >   and the dmesg show:
> >   "EXT4-fs error (device loop0): ext4_lookup:1626: inode #2049: comm ls:
> >   deleted inode referenced: 139"
> >
> > ext4_rename will create a special inode for whiteout and use this 'ino'
> > to replace the source file's dir entry 'ino'. Once error happens
> > latter(the error above was the ENOSPC return from ext4_add_entry in
> > ext4_rename since all space has been consumed), the cleanup do drop the
> > nlink for whiteout, but forget to restore 'ino' with source file. This
> > will trigger the bug describle as above.
> >
> > Signed-off-by: yangerkun <yangerkun@huawei.com>
>

Apropos RENAME_WHITEOUT, it seems to be missing __ext4_fc_track_link().
I guess test coverage of RENAME_WHITEOUT in fstests is not much.
I have been seeing trickles of bug fixes for RENAME_WHITEOUT for almost
every filesystem that supports it.

But I must say it would have been very hard to catch missing ext4_fc_track_*
without specialized fs fuzzer such as the CrashMonkey generated tests.

And as long as I am ranting, I'd like to point out that it is a shame
that whiteout
was not implemented as a special (constant) inode whose nlink is irrelevant
(or a special dirent with d_ino 0 and d_type DT_WHT for that matter).
It would have been a rather small RO_COMPAT on-disk change for ext4.
It could also be implemented in slightly more backward compat manner by
maintaining a valid nlink and postpone setting the RO_COMPAT flag until
EXT4_LINK_MAX is reached.

As things stand now, overlayfs makes an effort to maintain a singleton
hardlinked whiteout inode, without being able to use it with RENAME_WHITEOUT
and filesystems have to take special care to journal the metadata of all
individual whiteout inodes, without any added value to the only user
(overlayfs).

But I guess that train has left the station long ago...

Thanks,
Amir.
Miklos Szeredi Jan. 20, 2021, 8:42 a.m. UTC | #4
On Wed, Jan 20, 2021 at 7:57 AM Amir Goldstein <amir73il@gmail.com> wrote:

> And as long as I am ranting, I'd like to point out that it is a shame
> that whiteout
> was not implemented as a special (constant) inode whose nlink is irrelevant
> (or a special dirent with d_ino 0 and d_type DT_WHT for that matter).
> It would have been a rather small RO_COMPAT on-disk change for ext4.
> It could also be implemented in slightly more backward compat manner by
> maintaining a valid nlink and postpone setting the RO_COMPAT flag until
> EXT4_LINK_MAX is reached.
>
> As things stand now, overlayfs makes an effort to maintain a singleton
> hardlinked whiteout inode, without being able to use it with RENAME_WHITEOUT
> and filesystems have to take special care to journal the metadata of all
> individual whiteout inodes, without any added value to the only user
> (overlayfs).
>
> But I guess that train has left the station long ago...

Not so, I believe.  Kernel internal interfaces are easy to change, and
adding support for DT_WHT to overlayfs would mostly be a trivial
undertaking.

The big issue (as always) is userspace API's and not introducing
DT_WHT there was a very deliberate choice.  Adding a translation layer
from an internal whiteout representation to the userspace API also
does not seem to be a very complex problem, but I haven't looked into
that deeply.

So AFAICS there's really nothing preventing the addition of whiteout
objects to filesystems, other than developer dedication.

Thanks,
Miklos
harshad shirwadkar Jan. 22, 2021, 7:20 p.m. UTC | #5
Thanks Amir for pointing that out. Yes we are missing fast commit
tracking in whiteout. I'll send out a fix for that.

> But I must say it would have been very hard to catch missing ext4_fc_track_*
> without specialized fs fuzzer such as the CrashMonkey generated tests.

I agree, it's been on my to-do list to run CrashMonkey tests with fast
commits. I'm curious what kind of CrashMonkey tests you ran that
helped you catch this? Were you running Overlayfs on top of Ext4 with
fast commits?

Thanks,
Harshad

On Wed, Jan 20, 2021 at 12:42 AM Miklos Szeredi <miklos@szeredi.hu> wrote:
>
> On Wed, Jan 20, 2021 at 7:57 AM Amir Goldstein <amir73il@gmail.com> wrote:
>
> > And as long as I am ranting, I'd like to point out that it is a shame
> > that whiteout
> > was not implemented as a special (constant) inode whose nlink is irrelevant
> > (or a special dirent with d_ino 0 and d_type DT_WHT for that matter).
> > It would have been a rather small RO_COMPAT on-disk change for ext4.
> > It could also be implemented in slightly more backward compat manner by
> > maintaining a valid nlink and postpone setting the RO_COMPAT flag until
> > EXT4_LINK_MAX is reached.
> >
> > As things stand now, overlayfs makes an effort to maintain a singleton
> > hardlinked whiteout inode, without being able to use it with RENAME_WHITEOUT
> > and filesystems have to take special care to journal the metadata of all
> > individual whiteout inodes, without any added value to the only user
> > (overlayfs).
> >
> > But I guess that train has left the station long ago...
>
> Not so, I believe.  Kernel internal interfaces are easy to change, and
> adding support for DT_WHT to overlayfs would mostly be a trivial
> undertaking.
>
> The big issue (as always) is userspace API's and not introducing
> DT_WHT there was a very deliberate choice.  Adding a translation layer
> from an internal whiteout representation to the userspace API also
> does not seem to be a very complex problem, but I haven't looked into
> that deeply.
>
> So AFAICS there's really nothing preventing the addition of whiteout
> objects to filesystems, other than developer dedication.
>
> Thanks,
> Miklos
Amir Goldstein Jan. 22, 2021, 8:32 p.m. UTC | #6
On Fri, Jan 22, 2021 at 9:21 PM harshad shirwadkar
<harshadshirwadkar@gmail.com> wrote:
>
> Thanks Amir for pointing that out. Yes we are missing fast commit
> tracking in whiteout. I'll send out a fix for that.
>
> > But I must say it would have been very hard to catch missing ext4_fc_track_*
> > without specialized fs fuzzer such as the CrashMonkey generated tests.
>
> I agree, it's been on my to-do list to run CrashMonkey tests with fast
> commits. I'm curious what kind of CrashMonkey tests you ran that
> helped you catch this? Were you running Overlayfs on top of Ext4 with
> fast commits?
>

Neither. I just guessed RENAME_WHITEOUT might be missed as
developers are rarely aware of it.
I never ran CrashMonkey tests myself.
I found a few crash consistency bugs using xfstest generic/455.
I suggest that you run it with fast commits
and try using NUM_OPS and NUM_FILES larger than the test defaults
to let the test run for a longer time.

Thanks,
Amir.
Amir Goldstein March 9, 2021, 7:30 a.m. UTC | #7
On Fri, Jan 22, 2021 at 9:21 PM harshad shirwadkar
<harshadshirwadkar@gmail.com> wrote:
>
> Thanks Amir for pointing that out. Yes we are missing fast commit
> tracking in whiteout. I'll send out a fix for that.
>

Ping.

Harshad,

Did you forget or did I miss the patch?

Thanks,
Amir.
diff mbox series

Patch

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index b17a082b7db1..90f7ebeb69c8 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3593,9 +3593,6 @@  static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
 			return retval2;
 		}
 	}
-	brelse(ent->bh);
-	ent->bh = NULL;
-
 	return retval;
 }
 
@@ -3794,6 +3791,7 @@  static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 		}
 	}
 
+	old_file_type = old.de->file_type;
 	if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
 		ext4_handle_sync(handle);
 
@@ -3821,7 +3819,6 @@  static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 	force_reread = (new.dir->i_ino == old.dir->i_ino &&
 			ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA));
 
-	old_file_type = old.de->file_type;
 	if (whiteout) {
 		/*
 		 * Do this before adding a new entry, so the old entry is sure
@@ -3919,15 +3916,19 @@  static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 	retval = 0;
 
 end_rename:
-	brelse(old.dir_bh);
-	brelse(old.bh);
-	brelse(new.bh);
 	if (whiteout) {
-		if (retval)
+		if (retval) {
+			ext4_setent(handle, &old,
+				old.inode->i_ino, old_file_type);
 			drop_nlink(whiteout);
+		}
 		unlock_new_inode(whiteout);
 		iput(whiteout);
+
 	}
+	brelse(old.dir_bh);
+	brelse(old.bh);
+	brelse(new.bh);
 	if (handle)
 		ext4_journal_stop(handle);
 	return retval;