diff mbox series

[v3] ext4: Fix rec_len verify error

Message ID 20230801112337.1856215-1-zhangshida@kylinos.cn
State Superseded
Headers show
Series [v3] ext4: Fix rec_len verify error | expand

Commit Message

Stephen Zhang Aug. 1, 2023, 11:23 a.m. UTC
From: Shida Zhang <zhangshida@kylinos.cn>

With the configuration PAGE_SIZE 64k and filesystem blocksize 64k,
a problem occurred when more than 13 million files were directly created
under a directory:

EXT4-fs error (device xx): ext4_dx_csum_set:492: inode #xxxx: comm xxxxx: dir seems corrupt?  Run e2fsck -D.
EXT4-fs error (device xx): ext4_dx_csum_verify:463: inode #xxxx: comm xxxxx: dir seems corrupt?  Run e2fsck -D.
EXT4-fs error (device xx): dx_probe:856: inode #xxxx: block 8188: comm xxxxx: Directory index failed checksum

When enough files are created, fake_dirent->rec_len will be 0xffff,
which does not equal the blocksize 65536, i.e. 0x10000.

This is not the case when the blocksize is 4k:
when enough files are created, fake_dirent->rec_len will be 0x1000,
which equals the blocksize 4k, i.e. 0x1000.

The problem seems to be related to the limitation of the 16-bit field
when the blocksize is set to 64k.
To address this, helpers like ext4_rec_len_{from,to}_disk have already
been introduced to convert between the encoded and the
plain form of rec_len.

So fix this one by using the helper, and all the other
le16_to_cpu(ext4_dir_entry{,_2}.rec_len) accesses in this file too.

Cc: stable@kernel.org
Fixes: dbe89444042a ("ext4: Calculate and verify checksums for htree nodes")
Suggested-by: Andreas Dilger <adilger@dilger.ca>
Suggested-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Shida Zhang <zhangshida@kylinos.cn>
---
v1->v2:
 Use the existing helper to convert the rec_len, as suggested by Andreas.
v2->v3:
 1,Convert all the other rec_len if necessary, as suggested by Darrick.
 2,Rephrase the commit message.

 fs/ext4/namei.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

Comments

Darrick J. Wong Aug. 1, 2023, 3:18 p.m. UTC | #1
On Tue, Aug 01, 2023 at 07:23:37PM +0800, zhangshida wrote:
> From: Shida Zhang <zhangshida@kylinos.cn>
> 
> With the configuration PAGE_SIZE 64k and filesystem blocksize 64k,
> a problem occurred when more than 13 million files were directly created
> under a directory:
> 
> EXT4-fs error (device xx): ext4_dx_csum_set:492: inode #xxxx: comm xxxxx: dir seems corrupt?  Run e2fsck -D.
> EXT4-fs error (device xx): ext4_dx_csum_verify:463: inode #xxxx: comm xxxxx: dir seems corrupt?  Run e2fsck -D.
> EXT4-fs error (device xx): dx_probe:856: inode #xxxx: block 8188: comm xxxxx: Directory index failed checksum
> 
> When enough files are created, the fake_dirent->reclen will be 0xffff.
> it doesn't equal to the blocksize 65536, i.e. 0x10000.
> 
> But it is not the same condition when blocksize equals to 4k.
> when enough files are created, the fake_dirent->reclen will be 0x1000.
> it equals to the blocksize 4k, i.e. 0x1000.
> 
> The problem seems to be related to the limitation of the 16-bit field
> when the blocksize is set to 64k.
> To address this, helpers like ext4_rec_len_{from,to}_disk has already
> been introduce to complete the conversion between the encoded and the
> plain form of rec_len.
> 
> So fix this one by using the helper, and all the other
> le16_to_cpu(ext4_dir_entry{,_2}.rec_len) accesses in this file too.
> 
> Cc: stable@kernel.org
> Fixes: dbe89444042a ("ext4: Calculate and verify checksums for htree nodes")
> Suggested-by: Andreas Dilger <adilger@dilger.ca>
> Suggested-by: Darrick J. Wong <djwong@kernel.org>
> Signed-off-by: Shida Zhang <zhangshida@kylinos.cn>
> ---
> v1->v2:
>  Use the existing helper to covert the rec_len, as suggested by Andreas.
> v2->v3:
>  1,Covert all the other rec_len if necessary, as suggested by Darrick.
>  2,Rephrase the commit message.
> 
>  fs/ext4/namei.c | 16 ++++++++--------
>  1 file changed, 8 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index 0caf6c730ce3..8cb377b8ad86 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -346,14 +346,14 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
>  
>  #ifdef PARANOID
>  	struct ext4_dir_entry *d, *top;
> +	int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
>  
>  	d = (struct ext4_dir_entry *)bh->b_data;
>  	top = (struct ext4_dir_entry *)(bh->b_data +
> -		(EXT4_BLOCK_SIZE(inode->i_sb) -
> -		 sizeof(struct ext4_dir_entry_tail)));
> -	while (d < top && d->rec_len)
> +		(blocksize - sizeof(struct ext4_dir_entry_tail)));
> +	while (d < top && ext4_rec_len_from_disk(d->rec_len, blocksize))
>  		d = (struct ext4_dir_entry *)(((void *)d) +
> -		    le16_to_cpu(d->rec_len));
> +		    ext4_rec_len_from_disk(d->rec_len, blocksize));
>  
>  	if (d != top)
>  		return NULL;

This is still missing some pieces; what about this clause at line 367:

	if (t->det_reserved_zero1 ||
	    le16_to_cpu(t->det_rec_len) != sizeof(struct ext4_dir_entry_tail) ||
	    t->det_reserved_zero2 ||
	    t->det_reserved_ft != EXT4_FT_DIR_CSUM)
		return NULL;

> @@ -445,13 +445,13 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
>  	struct ext4_dir_entry *dp;
>  	struct dx_root_info *root;
>  	int count_offset;
> +	int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
>  
> -	if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb))
> +	if (ext4_rec_len_from_disk(dirent->rec_len, blocksize) == blocksize)
>  		count_offset = 8;
> -	else if (le16_to_cpu(dirent->rec_len) == 12) {
> +	else if (ext4_rec_len_from_disk(dirent->rec_len, blocksize) == 12) {

Why not lift this ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ to a
local variable?  @dirent doesn't change, right?

>  		dp = (struct ext4_dir_entry *)(((void *)dirent) + 12);
> -		if (le16_to_cpu(dp->rec_len) !=
> -		    EXT4_BLOCK_SIZE(inode->i_sb) - 12)
> +		if (ext4_rec_len_from_disk(dp->rec_len, blocksize) != blocksize - 12)
>  			return NULL;
>  		root = (struct dx_root_info *)(((void *)dp + 12));
>  		if (root->reserved_zero ||

What about dx_make_map?

Here's all the opencoded access I could find:

$ git grep le16.*rec_len fs/ext4/
fs/ext4/namei.c:356:                le16_to_cpu(d->rec_len));
fs/ext4/namei.c:367:        le16_to_cpu(t->det_rec_len) != sizeof(struct ext4_dir_entry_tail) ||
fs/ext4/namei.c:449:    if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb))
fs/ext4/namei.c:451:    else if (le16_to_cpu(dirent->rec_len) == 12) {
fs/ext4/namei.c:453:            if (le16_to_cpu(dp->rec_len) !=
fs/ext4/namei.c:1338:                   map_tail->size = le16_to_cpu(de->rec_len);

--D

> -- 
> 2.27.0
>
Stephen Zhang Aug. 2, 2023, 1:17 a.m. UTC | #2
Darrick J. Wong <djwong@kernel.org> 于2023年8月1日周二 23:18写道:
>
>
> This is sitll missing some pieces; what about this clause at line 367:
>
>         if (t->det_reserved_zero1 ||
>             le16_to_cpu(t->det_rec_len) != sizeof(struct ext4_dir_entry_tail) ||
>             t->det_reserved_zero2 ||
>             t->det_reserved_ft != EXT4_FT_DIR_CSUM)
>                 return NULL;
>

Yeah...

> > @@ -445,13 +445,13 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
> >       struct ext4_dir_entry *dp;
> >       struct dx_root_info *root;
> >       int count_offset;
> > +     int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
> >
> > -     if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb))
> > +     if (ext4_rec_len_from_disk(dirent->rec_len, blocksize) == blocksize)
> >               count_offset = 8;
> > -     else if (le16_to_cpu(dirent->rec_len) == 12) {
> > +     else if (ext4_rec_len_from_disk(dirent->rec_len, blocksize) == 12) {
>
> Why not lift this ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ to a
> local variable?  @dirent doesn't change, right?
>

Will do.

> >               dp = (struct ext4_dir_entry *)(((void *)dirent) + 12);
> > -             if (le16_to_cpu(dp->rec_len) !=
> > -                 EXT4_BLOCK_SIZE(inode->i_sb) - 12)
> > +             if (ext4_rec_len_from_disk(dp->rec_len, blocksize) != blocksize - 12)
> >                       return NULL;
> >               root = (struct dx_root_info *)(((void *)dp + 12));
> >               if (root->reserved_zero ||
>
> What about dx_make_map?
>

dx_make_map(
     ....
     map_tail->size = le16_to_cpu(de->rec_len);

That might be a questionable one...
map_tail->size is 16 bits, while the key reason we want ext4_rec_len_from_disk
is to convert the 16-bit on-disk rec_len to an in-memory form that needs 17
bits, i.e. 0x10000.

Cheers,
Shida


> Here's all the opencoded access I could find:
>
> $ git grep le16.*rec_len fs/ext4/
> fs/ext4/namei.c:356:                le16_to_cpu(d->rec_len));
> fs/ext4/namei.c:367:        le16_to_cpu(t->det_rec_len) != sizeof(struct ext4_dir_entry_tail) ||
> fs/ext4/namei.c:449:    if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb))
> fs/ext4/namei.c:451:    else if (le16_to_cpu(dirent->rec_len) == 12) {
> fs/ext4/namei.c:453:            if (le16_to_cpu(dp->rec_len) !=
> fs/ext4/namei.c:1338:                   map_tail->size = le16_to_cpu(de->rec_len);
>
> --D
>
> > --
> > 2.27.0
> >
Andreas Dilger Aug. 2, 2023, 6:07 a.m. UTC | #3
On Aug 1, 2023, at 9:18 AM, Darrick J. Wong <djwong@kernel.org> wrote:
> 
> On Tue, Aug 01, 2023 at 07:23:37PM +0800, zhangshida wrote:
>> From: Shida Zhang <zhangshida@kylinos.cn>
>> 
>> With the configuration PAGE_SIZE 64k and filesystem blocksize 64k,
>> a problem occurred when more than 13 million files were directly created
>> under a directory:
>> 
>> EXT4-fs error (device xx): ext4_dx_csum_set:492: inode #xxxx: comm xxxxx: dir seems corrupt?  Run e2fsck -D.
>> EXT4-fs error (device xx): ext4_dx_csum_verify:463: inode #xxxx: comm xxxxx: dir seems corrupt?  Run e2fsck -D.
>> EXT4-fs error (device xx): dx_probe:856: inode #xxxx: block 8188: comm xxxxx: Directory index failed checksum
>> 
>> When enough files are created, the fake_dirent->reclen will be 0xffff.
>> it doesn't equal to the blocksize 65536, i.e. 0x10000.
>> 
>> But it is not the same condition when blocksize equals to 4k.
>> when enough files are created, the fake_dirent->reclen will be 0x1000.
>> it equals to the blocksize 4k, i.e. 0x1000.
>> 
>> The problem seems to be related to the limitation of the 16-bit field
>> when the blocksize is set to 64k.
>> To address this, helpers like ext4_rec_len_{from,to}_disk has already
>> been introduce to complete the conversion between the encoded and the
>> plain form of rec_len.
>> 
>> So fix this one by using the helper, and all the other
>> le16_to_cpu(ext4_dir_entry{,_2}.rec_len) accesses in this file too.
>> 
>> Cc: stable@kernel.org
>> Fixes: dbe89444042a ("ext4: Calculate and verify checksums for htree nodes")
>> Suggested-by: Andreas Dilger <adilger@dilger.ca>
>> Suggested-by: Darrick J. Wong <djwong@kernel.org>
>> Signed-off-by: Shida Zhang <zhangshida@kylinos.cn>
>> ---
>> v1->v2:
>> Use the existing helper to covert the rec_len, as suggested by Andreas.
>> v2->v3:
>> 1,Covert all the other rec_len if necessary, as suggested by Darrick.
>> 2,Rephrase the commit message.
>> 
>> fs/ext4/namei.c | 16 ++++++++--------
>> 1 file changed, 8 insertions(+), 8 deletions(-)
>> 
>> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
>> index 0caf6c730ce3..8cb377b8ad86 100644
>> --- a/fs/ext4/namei.c
>> +++ b/fs/ext4/namei.c
>> @@ -346,14 +346,14 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
>> 
>> #ifdef PARANOID
>> 	struct ext4_dir_entry *d, *top;
>> +	int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
>> 
>> 	d = (struct ext4_dir_entry *)bh->b_data;
>> 	top = (struct ext4_dir_entry *)(bh->b_data +
>> -		(EXT4_BLOCK_SIZE(inode->i_sb) -
>> -		 sizeof(struct ext4_dir_entry_tail)));
>> -	while (d < top && d->rec_len)
>> +		(blocksize - sizeof(struct ext4_dir_entry_tail)));
>> +	while (d < top && ext4_rec_len_from_disk(d->rec_len, blocksize))
>> 		d = (struct ext4_dir_entry *)(((void *)d) +
>> -		    le16_to_cpu(d->rec_len));
>> +		    ext4_rec_len_from_disk(d->rec_len, blocksize));
>> 
>> 	if (d != top)
>> 		return NULL;
> 
> This is sitll missing some pieces; what about this clause at line 367:
> 
> 	if (t->det_reserved_zero1 ||
> 	    le16_to_cpu(t->det_rec_len) != sizeof(struct ext4_dir_entry_tail) ||
> 	    t->det_reserved_zero2 ||
> 	    t->det_reserved_ft != EXT4_FT_DIR_CSUM)
> 		return NULL;
> 
>> @@ -445,13 +445,13 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
>> 	struct ext4_dir_entry *dp;
>> 	struct dx_root_info *root;
>> 	int count_offset;
>> +	int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
>> 
>> -	if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb))
>> +	if (ext4_rec_len_from_disk(dirent->rec_len, blocksize) == blocksize)
>> 		count_offset = 8;
>> -	else if (le16_to_cpu(dirent->rec_len) == 12) {
>> +	else if (ext4_rec_len_from_disk(dirent->rec_len, blocksize) == 12) {
> 
> Why not lift this ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ to a
> local variable?  @dirent doesn't change, right?
> 
>> 		dp = (struct ext4_dir_entry *)(((void *)dirent) + 12);
>> -		if (le16_to_cpu(dp->rec_len) !=
>> -		    EXT4_BLOCK_SIZE(inode->i_sb) - 12)
>> +		if (ext4_rec_len_from_disk(dp->rec_len, blocksize) != blocksize - 12)
>> 			return NULL;
>> 		root = (struct dx_root_info *)(((void *)dp + 12));
>> 		if (root->reserved_zero ||
> 
> What about dx_make_map?
> 
> Here's all the opencoded access I could find:
> 
> $ git grep le16.*rec_len fs/ext4/
> fs/ext4/namei.c:356:                le16_to_cpu(d->rec_len));
> fs/ext4/namei.c:367:        le16_to_cpu(t->det_rec_len) != sizeof(struct ext4_dir_entry_tail) ||
> fs/ext4/namei.c:449:    if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb))
> fs/ext4/namei.c:451:    else if (le16_to_cpu(dirent->rec_len) == 12) {
> fs/ext4/namei.c:453:            if (le16_to_cpu(dp->rec_len) !=
> fs/ext4/namei.c:1338:                   map_tail->size = le16_to_cpu(de->rec_len);

Not all of these cases are actual bugs.  The ext4_rec_len_from_disk()
function is only different for rec_len >= 2^16, so if it is comparing
rec_len against "12" or "sizeof(struct ...)" then the inequality will
be correct regardless of how it is decoded.

That said, it makes sense to use ext4_rec_len_from_disk() to access
rec_len consistently throughout the code, since that avoids potential
bugs in the future.  We know the code will eventually be copied
some place where rec_len >= 2^16 is actually important, and we may as
well avoid that bug before it happens.


One thing this discussion *does* expose is that ext4_rec_len_from_disk()
is hard-coded at compile time to differentiate between PAGE_SIZE >= 64k
and PAGE_SIZE = 4K, because it was never possible to have blocksize >
PAGE_SIZE, so only ARM/PPC ever had filesystems with blocksize=64KiB
(and the Fujitsu Fugaku SPARC system with blocksize=256KiB).

However, with the recent advent of the VM and IO layers allowing
blocksize > PAGE_SIZE this function will need to be changed to allow
the same on x86 PAGE_SIZE=4KiB systems.  Instead of checking

  #if PAGE_SIZE >= 65536

it should handle this based on the filesystem blocksize at runtime:

/*
 * Decode the 16-bit on-disk rec_len of a directory entry into a byte count.
 *
 * @dlen:      little-endian rec_len value as read from the directory entry
 * @blocksize: filesystem block size in bytes, checked at runtime
 *
 * With a block size below 64KiB the stored value is returned unchanged.
 * With larger blocks, both 0 and EXT4_MAX_REC_LEN stand for "this entry
 * covers the whole block"; for every other value the two low bits of the
 * stored field are moved up to become bits 16-17 of the returned length.
 */
static inline
unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
{
	unsigned raw = le16_to_cpu(dlen);
	unsigned low_part;
	unsigned high_part;

	/* Small blocks never need the extended encoding. */
	if (blocksize < 65536)
		return raw;

	/* Either sentinel means the record spans the entire block. */
	if (raw == 0 || raw == EXT4_MAX_REC_LEN)
		return blocksize;

	low_part = raw & 65532;		/* bits 2-15 are stored in place */
	high_part = (raw & 3) << 16;	/* bits 0-1 carry length bits 16-17 */

	return low_part | high_part;
}

Strictly speaking, ((len & 65532) | ((len & 3) << 16)) should equal "len"
for any filesystem with blocksize < 65536, but IMHO it is more clear if
the code is written this way.

Similarly, the encoding needs to be changed to handle large records at
runtime for when we eventually allow ext4 with blocksize > PAGE_SIZE.

/*
 * Encode an in-memory directory entry length into the 16-bit on-disk rec_len.
 *
 * @len:       record length in bytes; must be a multiple of 4 and must not
 *             exceed @blocksize
 * @blocksize: filesystem block size in bytes; at most 256KiB (1 << 18)
 *
 * Any violation of the constraints above triggers BUG_ON().
 *
 * Lengths below 64KiB are stored as-is.  A record that fills the whole
 * block is stored as EXT4_MAX_REC_LEN when the block is exactly 64KiB and
 * as 0 for larger blocks.  Any other length has its bits 16-17 folded into
 * the two low bits of the stored value.
 */
static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
{
	unsigned encoded;

	BUG_ON(len > blocksize);
	BUG_ON(blocksize > (1 << 18));
	BUG_ON(len & 3);

	if (len >= 65536) {
		if (len != blocksize) {
			/* Low bits are free because len is 4-byte aligned. */
			encoded = (len & 65532) | ((len >> 16) & 3);
		} else if (blocksize == 65536) {
			encoded = EXT4_MAX_REC_LEN;
		} else {
			encoded = 0;
		}
	} else {
		/* Always taken for blocksize < 65536, since len <= blocksize. */
		encoded = len;
	}

	return cpu_to_le16(encoded);
}


Cheers, Andreas
Stephen Zhang Aug. 3, 2023, 1:52 a.m. UTC | #4
Andreas Dilger <adilger@dilger.ca> 于2023年8月2日周三 14:07写道:
>
> Not all of these cases are actual bugs.  The ext4_rec_len_from_disk()
> function is only different for rec_len >= 2^16, so if it is comparing
> rec_len against "12" or "sizeof(struct ...)" then the inequality will
> be correct regardless of how it is decoded.
>
> That said, it makes sense to use ext4_rec_len_from_disk() to access
> rec_len consistently throughout the code, since that avoids potential
> bugs in the future.  We know the code will eventually will be copied
> some place where rec_len >= 2^16 is actually important, and we may as
> well avoid that bug before it happens.
>
>
> One thing this discussion *does* expose is that ext4_rec_len_from_disk()
> is hard-coded at compile time to differentiate between PAGE_SIZE > 64k
> and PAGE_SIZE = 4K, because it was never possible to have blocksize >
> PAGE_SIZE, so only ARM/PPC ever had filesystems with blocksize=64KiB
> (and the Fujitsu Fugaku SPARC system with blocksize=256KiB).
>
> However, with the recent advent of the VM and IO layers allowing
> blocksize > PAGE_SIZE this function will need to be changed to allow
> the same on x86 PAGE_SIZE=4KiB systems.  Instead of checking
>
>   #if PAGE_SIZE >= 65536
>
> it should handle this based on the filesystem blocksize at runtime:
>
> static inline
> unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
> {
>         unsigned len = le16_to_cpu(dlen);
>
>         if (blocksize < 65536)
>                 return len;
>
>         if (len == EXT4_MAX_REC_LEN || len == 0)
>                 return blocksize;
>
>         return (len & 65532) | ((len & 3) << 16);
> }
>
> Strictly speaking, ((len & 65532) | ((len & 3) << 16) should equal "len"
> for any filesystem with blocksize < 65536, but IMHO it is more clear if
> the code is written this way.
>
> Similarly, the encoding needs to be changed to handle large records at
> runtime for when we eventually allow ext4 with blocksize > PAGE_SIZE.
>
> static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
> {
>         BUG_ON(len > blocksize);
>         BUG_ON(blocksize > (1 << 18));
>         BUG_ON(len & 3);
>
>         if (len < 65536) /* always true for blocksize < 65536 */
>                 return cpu_to_le16(len);
>
>         if (len == blocksize) {
>                 if (blocksize == 65536)
>                         return cpu_to_le16(EXT4_MAX_REC_LEN);
>
>                 return cpu_to_le16(0);
>         }
>
>         return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
> }
>

Hmm, at least it sounds reasonable to me based on my limited
knowledge. However, I am not sure whether you want me to incorporate
these changes into this particular commit or another patch within this
submission.

By default, I will simply leave it for further discussion. Please let
me know if you have any ideas.

Cheers,
Shida

>
> Cheers, Andreas
>
>
>
>
>
Darrick J. Wong Aug. 3, 2023, 3:09 a.m. UTC | #5
On Thu, Aug 03, 2023 at 09:52:53AM +0800, Stephen Zhang wrote:
> Andreas Dilger <adilger@dilger.ca> 于2023年8月2日周三 14:07写道:
> >
> > Not all of these cases are actual bugs.  The ext4_rec_len_from_disk()
> > function is only different for rec_len >= 2^16, so if it is comparing
> > rec_len against "12" or "sizeof(struct ...)" then the inequality will
> > be correct regardless of how it is decoded.
> >
> > That said, it makes sense to use ext4_rec_len_from_disk() to access
> > rec_len consistently throughout the code, since that avoids potential
> > bugs in the future.  We know the code will eventually will be copied
> > some place where rec_len >= 2^16 is actually important, and we may as
> > well avoid that bug before it happens.
> >
> >
> > One thing this discussion *does* expose is that ext4_rec_len_from_disk()
> > is hard-coded at compile time to differentiate between PAGE_SIZE > 64k
> > and PAGE_SIZE = 4K, because it was never possible to have blocksize >
> > PAGE_SIZE, so only ARM/PPC ever had filesystems with blocksize=64KiB
> > (and the Fujitsu Fugaku SPARC system with blocksize=256KiB).
> >
> > However, with the recent advent of the VM and IO layers allowing
> > blocksize > PAGE_SIZE this function will need to be changed to allow
> > the same on x86 PAGE_SIZE=4KiB systems.  Instead of checking
> >
> >   #if PAGE_SIZE >= 65536
> >
> > it should handle this based on the filesystem blocksize at runtime:
> >
> > static inline
> > unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
> > {
> >         unsigned len = le16_to_cpu(dlen);
> >
> >         if (blocksize < 65536)
> >                 return len;
> >
> >         if (len == EXT4_MAX_REC_LEN || len == 0)
> >                 return blocksize;
> >
> >         return (len & 65532) | ((len & 3) << 16);
> > }
> >
> > Strictly speaking, ((len & 65532) | ((len & 3) << 16) should equal "len"
> > for any filesystem with blocksize < 65536, but IMHO it is more clear if
> > the code is written this way.
> >
> > Similarly, the encoding needs to be changed to handle large records at
> > runtime for when we eventually allow ext4 with blocksize > PAGE_SIZE.
> >
> > static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
> > {
> >         BUG_ON(len > blocksize);
> >         BUG_ON(blocksize > (1 << 18));
> >         BUG_ON(len & 3);
> >
> >         if (len < 65536) /* always true for blocksize < 65536 */
> >                 return cpu_to_le16(len);
> >
> >         if (len == blocksize) {
> >                 if (blocksize == 65536)
> >                         return cpu_to_le16(EXT4_MAX_REC_LEN);
> >
> >                 return cpu_to_le16(0);
> >         }
> >
> >         return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
> > }
> >
> 
> Hmm, at least it sounds reasonable to me based on my limited
> knowledge. However, I am not sure whether you want me to incorporate
> these changes into this particular commit or another patch within this
> submission.
> 
> By default, I will simply leave it for further discussion. Please let
> me know if you have any ideas.

ext4 doesn't support blocksize > PAGE_SIZE yet.  Don't worry about this
for now.

--D

> Cheers,
> Shida
> 
> >
> > Cheers, Andreas
> >
> >
> >
> >
> >
Andreas Dilger Aug. 3, 2023, 10:34 p.m. UTC | #6
On Aug 2, 2023, at 9:09 PM, Darrick J. Wong <djwong@kernel.org> wrote:
> 
> On Thu, Aug 03, 2023 at 09:52:53AM +0800, Stephen Zhang wrote:
>> Andreas Dilger <adilger@dilger.ca> 于2023年8月2日周三 14:07写道:
>>> 
>>> Not all of these cases are actual bugs.  The ext4_rec_len_from_disk()
>>> function is only different for rec_len >= 2^16, so if it is comparing
>>> rec_len against "12" or "sizeof(struct ...)" then the inequality will
>>> be correct regardless of how it is decoded.
>>> 
>>> That said, it makes sense to use ext4_rec_len_from_disk() to access
>>> rec_len consistently throughout the code, since that avoids potential
>>> bugs in the future.  We know the code will eventually will be copied
>>> some place where rec_len >= 2^16 is actually important, and we may as
>>> well avoid that bug before it happens.
>>> 
>>> 
>>> One thing this discussion *does* expose is that ext4_rec_len_from_disk()
>>> is hard-coded at compile time to differentiate between PAGE_SIZE > 64k
>>> and PAGE_SIZE = 4K, because it was never possible to have blocksize >
>>> PAGE_SIZE, so only ARM/PPC ever had filesystems with blocksize=64KiB
>>> (and the Fujitsu Fugaku SPARC system with blocksize=256KiB).
>>> 
>>> However, with the recent advent of the VM and IO layers allowing
>>> blocksize > PAGE_SIZE this function will need to be changed to allow
>>> the same on x86 PAGE_SIZE=4KiB systems.  Instead of checking
>>> 
>>>  #if PAGE_SIZE >= 65536
>>> 
>>> it should handle this based on the filesystem blocksize at runtime:
>>> 
>>> static inline
>>> unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
>>> {
>>>        unsigned len = le16_to_cpu(dlen);
>>> 
>>>        if (blocksize < 65536)
>>>                return len;
>>> 
>>>        if (len == EXT4_MAX_REC_LEN || len == 0)
>>>                return blocksize;
>>> 
>>>        return (len & 65532) | ((len & 3) << 16);
>>> }
>>> 
>>> Strictly speaking, ((len & 65532) | ((len & 3) << 16) should equal "len"
>>> for any filesystem with blocksize < 65536, but IMHO it is more clear if
>>> the code is written this way.
>>> 
>>> Similarly, the encoding needs to be changed to handle large records at
>>> runtime for when we eventually allow ext4 with blocksize > PAGE_SIZE.
>>> 
>>> static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
>>> {
>>>        BUG_ON(len > blocksize);
>>>        BUG_ON(blocksize > (1 << 18));
>>>        BUG_ON(len & 3);
>>> 
>>>        if (len < 65536) /* always true for blocksize < 65536 */
>>>                return cpu_to_le16(len);
>>> 
>>>        if (len == blocksize) {
>>>                if (blocksize == 65536)
>>>                        return cpu_to_le16(EXT4_MAX_REC_LEN);
>>> 
>>>                return cpu_to_le16(0);
>>>        }
>>> 
>>>        return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
>>> }
>>> 
>> 
>> Hmm, at least it sounds reasonable to me based on my limited
>> knowledge. However, I am not sure whether you want me to incorporate
>> these changes into this particular commit or another patch within this
>> submission.
>> 
>> By default, I will simply leave it for further discussion. Please let
>> me know if you have any ideas.
> 
> ext4 doesn't support blocksize > PAGE_SIZE yet.  Don't worry about this
> for now.

I agree it doesn't need to be merged into the current patch.

It's something that could be fixed in a follow-on patch, to have one less
bug to fix in the future when ext4 *does* support blocksize > PAGE_SIZE,
which isn't so far away anymore.

Cheers, Andreas
Stephen Zhang Aug. 4, 2023, 2:11 a.m. UTC | #7
Andreas Dilger <adilger@dilger.ca> 于2023年8月4日周五 06:34写道:
>
> On Aug 2, 2023, at 9:09 PM, Darrick J. Wong <djwong@kernel.org> wrote:
> >
> > On Thu, Aug 03, 2023 at 09:52:53AM +0800, Stephen Zhang wrote:
> >> Andreas Dilger <adilger@dilger.ca> 于2023年8月2日周三 14:07写道:
> >>>
> >>> Not all of these cases are actual bugs.  The ext4_rec_len_from_disk()
> >>> function is only different for rec_len >= 2^16, so if it is comparing
> >>> rec_len against "12" or "sizeof(struct ...)" then the inequality will
> >>> be correct regardless of how it is decoded.
> >>>
> >>> That said, it makes sense to use ext4_rec_len_from_disk() to access
> >>> rec_len consistently throughout the code, since that avoids potential
> >>> bugs in the future.  We know the code will eventually will be copied
> >>> some place where rec_len >= 2^16 is actually important, and we may as
> >>> well avoid that bug before it happens.
> >>>
> >>>
> >>> One thing this discussion *does* expose is that ext4_rec_len_from_disk()
> >>> is hard-coded at compile time to differentiate between PAGE_SIZE > 64k
> >>> and PAGE_SIZE = 4K, because it was never possible to have blocksize >
> >>> PAGE_SIZE, so only ARM/PPC ever had filesystems with blocksize=64KiB
> >>> (and the Fujitsu Fugaku SPARC system with blocksize=256KiB).
> >>>
> >>> However, with the recent advent of the VM and IO layers allowing
> >>> blocksize > PAGE_SIZE this function will need to be changed to allow
> >>> the same on x86 PAGE_SIZE=4KiB systems.  Instead of checking
> >>>
> >>>  #if PAGE_SIZE >= 65536
> >>>
> >>> it should handle this based on the filesystem blocksize at runtime:
> >>>
> >>> static inline
> >>> unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
> >>> {
> >>>        unsigned len = le16_to_cpu(dlen);
> >>>
> >>>        if (blocksize < 65536)
> >>>                return len;
> >>>
> >>>        if (len == EXT4_MAX_REC_LEN || len == 0)
> >>>                return blocksize;
> >>>
> >>>        return (len & 65532) | ((len & 3) << 16);
> >>> }
> >>>
> >>> Strictly speaking, ((len & 65532) | ((len & 3) << 16) should equal "len"
> >>> for any filesystem with blocksize < 65536, but IMHO it is more clear if
> >>> the code is written this way.
> >>>
> >>> Similarly, the encoding needs to be changed to handle large records at
> >>> runtime for when we eventually allow ext4 with blocksize > PAGE_SIZE.
> >>>
> >>> static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
> >>> {
> >>>        BUG_ON(len > blocksize);
> >>>        BUG_ON(blocksize > (1 << 18));
> >>>        BUG_ON(len & 3);
> >>>
> >>>        if (len < 65536) /* always true for blocksize < 65536 */
> >>>                return cpu_to_le16(len);
> >>>
> >>>        if (len == blocksize) {
> >>>                if (blocksize == 65536)
> >>>                        return cpu_to_le16(EXT4_MAX_REC_LEN);
> >>>
> >>>                return cpu_to_le16(0);
> >>>        }
> >>>
> >>>        return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
> >>> }
> >>>
> >>
> >> Hmm, at least it sounds reasonable to me based on my limited
> >> knowledge. However, I am not sure whether you want me to incorporate
> >> these changes into this particular commit or another patch within this
> >> submission.
> >>
> >> By default, I will simply leave it for further discussion. Please let
> >> me know if you have any ideas.
> >
> > ext4 doesn't support blocksize > PAGE_SIZE yet.  Don't worry about this
> > for now.
>
> I agree it doesn't need to be merged into the current patch.
>
> It's something that could be fixed in a follow-on patch, to have one less
> bug to fix in the future when ext4 *does* support blocksize > PAGE_SIZE,
> which isn't so far away anymore.
>

Okay, I will attempt to submit another follow-on patch based on this discussion
after this one.

Cheers,
Shida

> Cheers, Andreas
>
>
>
>
>
diff mbox series

Patch

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 0caf6c730ce3..8cb377b8ad86 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -346,14 +346,14 @@  static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
 
 #ifdef PARANOID
 	struct ext4_dir_entry *d, *top;
+	int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
 
 	d = (struct ext4_dir_entry *)bh->b_data;
 	top = (struct ext4_dir_entry *)(bh->b_data +
-		(EXT4_BLOCK_SIZE(inode->i_sb) -
-		 sizeof(struct ext4_dir_entry_tail)));
-	while (d < top && d->rec_len)
+		(blocksize - sizeof(struct ext4_dir_entry_tail)));
+	while (d < top && ext4_rec_len_from_disk(d->rec_len, blocksize))
 		d = (struct ext4_dir_entry *)(((void *)d) +
-		    le16_to_cpu(d->rec_len));
+		    ext4_rec_len_from_disk(d->rec_len, blocksize));
 
 	if (d != top)
 		return NULL;
@@ -445,13 +445,13 @@  static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
 	struct ext4_dir_entry *dp;
 	struct dx_root_info *root;
 	int count_offset;
+	int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
 
-	if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb))
+	if (ext4_rec_len_from_disk(dirent->rec_len, blocksize) == blocksize)
 		count_offset = 8;
-	else if (le16_to_cpu(dirent->rec_len) == 12) {
+	else if (ext4_rec_len_from_disk(dirent->rec_len, blocksize) == 12) {
 		dp = (struct ext4_dir_entry *)(((void *)dirent) + 12);
-		if (le16_to_cpu(dp->rec_len) !=
-		    EXT4_BLOCK_SIZE(inode->i_sb) - 12)
+		if (ext4_rec_len_from_disk(dp->rec_len, blocksize) != blocksize - 12)
 			return NULL;
 		root = (struct dx_root_info *)(((void *)dp + 12));
 		if (root->reserved_zero ||