diff mbox

fs: ext4: inode->i_generation not assigned 0.

Message ID c9f7090b2a58364998ad4a49cfdac379b905c7e9.1498687302.git.kkc6196@fb.com
State Rejected
Headers show

Commit Message

Kyungchan Koh June 28, 2017, 10:06 p.m. UTC
In fs/ext4/super.c, the function ext4_nfs_get_inode takes as input
"generation" that can be used to specify the generation of the inode to
be returned. When 0 is given as input, then inodes of any generation can
be returned. Therefore, generation 0 is a special case that should be
avoided when assigning generation to inodes.

A new inline function, ext4_inode_set_gen, will take care of the
problem.  Now, inodes cannot have a generation of 0, so this patch fixes
the issue.

Signed-off-by: Kyungchan Koh <kkc6196@fb.com>
---
 fs/ext4/ext4.h   | 8 ++++++++
 fs/ext4/ialloc.c | 2 +-
 fs/ext4/ioctl.c  | 4 ++--
 3 files changed, 11 insertions(+), 3 deletions(-)

Comments

Darrick Wong June 29, 2017, 12:48 a.m. UTC | #1
On Wed, Jun 28, 2017 at 03:06:42PM -0700, Kyungchan Koh wrote:
> In fs/ext4/super.c, the function ext4_nfs_get_inode takes as input
> "generation" that can be used to specify the generation of the inode to
> be returned. When 0 is given as input, then inodes of any generation can
> be returned. Therefore, generation 0 is a special case that should be
> avoided when assigning generation to inodes.
> 
> A new inline function, ext4_inode_set_gen, will take care of the
> problem.  Now, inodes cannot have a generation of 0, so this patch fixes
> the issue.

Forgive my ignorance, but why is generation == 0 a special case?

From a quick scan of the code it seems that filesystems hand out
handles to NFS with parent_{ino,gen} set (or zeroed).  That implies that
we have to check ino/gen for zeroes and garbage, but I don't see why
you'd exempt gen == 0 from checking?

(Really what I'm fishing for is whether or not there's some precedent
for this that I don't know about.)

--D

> 
> Signed-off-by: Kyungchan Koh <kkc6196@fb.com>
> ---
>  fs/ext4/ext4.h   | 8 ++++++++
>  fs/ext4/ialloc.c | 2 +-
>  fs/ext4/ioctl.c  | 4 ++--
>  3 files changed, 11 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 3219154..74c6677 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1549,6 +1549,14 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
>  		 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
>  }
>  
> +static inline void ext4_inode_set_gen(struct inode *inode,
> +				      struct ext4_sb_info *sbi)
> +{
> +	inode->i_generation = sbi->s_next_generation++;
> +	if (!inode->i_generation)
> +		inode->i_generation = sbi->s_next_generation++;
> +}
> +
>  /*
>   * Inode dynamic state flags
>   */
> diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
> index 98ac2f1..d33f6f0 100644
> --- a/fs/ext4/ialloc.c
> +++ b/fs/ext4/ialloc.c
> @@ -1072,7 +1072,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
>  		goto out;
>  	}
>  	spin_lock(&sbi->s_next_gen_lock);
> -	inode->i_generation = sbi->s_next_generation++;
> +	ext4_inode_set_gen(inode, sbi);
>  	spin_unlock(&sbi->s_next_gen_lock);
>  
>  	/* Precompute checksum seed for inode metadata */
> diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
> index 0c21e22..d52a467 100644
> --- a/fs/ext4/ioctl.c
> +++ b/fs/ext4/ioctl.c
> @@ -160,8 +160,8 @@ static long swap_inode_boot_loader(struct super_block *sb,
>  	inode->i_ctime = inode_bl->i_ctime = current_time(inode);
>  
>  	spin_lock(&sbi->s_next_gen_lock);
> -	inode->i_generation = sbi->s_next_generation++;
> -	inode_bl->i_generation = sbi->s_next_generation++;
> +	ext4_inode_set_gen(inode, sbi);
> +	ext4_inode_set_gen(inode_bl, sbi);
>  	spin_unlock(&sbi->s_next_gen_lock);
>  
>  	ext4_discard_preallocations(inode);
> -- 
> 2.9.3
>
Kyungchan Koh June 29, 2017, 12:58 a.m. UTC | #2
On 6/28/17, 5:48 PM, "Darrick J. Wong" <darrick.wong@oracle.com> wrote:

    On Wed, Jun 28, 2017 at 03:06:42PM -0700, Kyungchan Koh wrote:
    > In fs/ext4/super.c, the function ext4_nfs_get_inode takes as input

    > "generation" that can be used to specify the generation of the inode to

    > be returned. When 0 is given as input, then inodes of any generation can

    > be returned. Therefore, generation 0 is a special case that should be

    > avoided when assigning generation to inodes.

    > 

    > A new inline function, ext4_inode_set_gen, will take care of the

    > problem.  Now, inodes cannot have a generation of 0, so this patch fixes

    > the issue.

    
    Forgive my ignorance, but why is generation == 0 a special case?
    
    From a quick scan of the code it seems that filesystems hand out
    handles to NFS with parent_{ino,gen} set (or zeroed).  That implies that
    we have to check ino/gen for zeroes and garbage, but I don't see why
    you'd exempt gen == 0 from checking?
    
    (Really what I'm fishing for is whether or not there's some precedent
    for this that I don't know about.)
    
    --D
    
    > 

    > Signed-off-by: Kyungchan Koh <kkc6196@fb.com>

    > ---

    >  fs/ext4/ext4.h   | 8 ++++++++

    >  fs/ext4/ialloc.c | 2 +-

    >  fs/ext4/ioctl.c  | 4 ++--

    >  3 files changed, 11 insertions(+), 3 deletions(-)

    > 

    > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h

    > index 3219154..74c6677 100644

    > --- a/fs/ext4/ext4.h

    > +++ b/fs/ext4/ext4.h

    > @@ -1549,6 +1549,14 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)

    >  		 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));

    >  }

    >  

    > +static inline void ext4_inode_set_gen(struct inode *inode,

    > +				      struct ext4_sb_info *sbi)

    > +{

    > +	inode->i_generation = sbi->s_next_generation++;

    > +	if (!inode->i_generation)

    > +		inode->i_generation = sbi->s_next_generation++;

    > +}

    > +

    >  /*

    >   * Inode dynamic state flags

    >   */

    > diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c

    > index 98ac2f1..d33f6f0 100644

    > --- a/fs/ext4/ialloc.c

    > +++ b/fs/ext4/ialloc.c

    > @@ -1072,7 +1072,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,

    >  		goto out;

    >  	}

    >  	spin_lock(&sbi->s_next_gen_lock);

    > -	inode->i_generation = sbi->s_next_generation++;

    > +	ext4_inode_set_gen(inode, sbi);

    >  	spin_unlock(&sbi->s_next_gen_lock);

    >  

    >  	/* Precompute checksum seed for inode metadata */

    > diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c

    > index 0c21e22..d52a467 100644

    > --- a/fs/ext4/ioctl.c

    > +++ b/fs/ext4/ioctl.c

    > @@ -160,8 +160,8 @@ static long swap_inode_boot_loader(struct super_block *sb,

    >  	inode->i_ctime = inode_bl->i_ctime = current_time(inode);

    >  

    >  	spin_lock(&sbi->s_next_gen_lock);

    > -	inode->i_generation = sbi->s_next_generation++;

    > -	inode_bl->i_generation = sbi->s_next_generation++;

    > +	ext4_inode_set_gen(inode, sbi);

    > +	ext4_inode_set_gen(inode_bl, sbi);

    >  	spin_unlock(&sbi->s_next_gen_lock);

    >  

    >  	ext4_discard_preallocations(inode);

    > -- 

    > 2.9.3

    > 

     

Generation == 0 seems to be a special case for many filesystems, not just ext4. For jfs, in jfs_nfs_get_inode, if the input generation is 0, then no inodes are returned. Such filesystems that seem to treat generation 0 as a special case nfs_get_inode that I have found so far are ext2, ext4, jfs,  exofs, and f2fs. Therefore, I was actually thinking about implementing a shared helper in linux/fs.h that has the prototype “static inline void inode_set_gen(struct inode *inode, unsigned int *generation)” that can be used for all filesystems. For example, for jfs, I can do “inode_set_gen(inode, &JFS_SBI(sb)->gengen);” or for extX, I can do “inode_set_gen(inode, &EXTX_SB(sb)->s_next_generation);”. This allows a cleaner change of adding a few lines of code to linux/fs.h and replacing one to a few lines for each filesystem. I am open to both options, if anyone has a strong preference for either option.

Best,
Kyungchan Koh
Andreas Dilger June 29, 2017, 2:32 a.m. UTC | #3
On Jun 28, 2017, at 4:06 PM, Kyungchan Koh <kkc6196@fb.com> wrote:
> 
> In fs/ext4/super.c, the function ext4_nfs_get_inode takes as input
> "generation" that can be used to specify the generation of the inode to
> be returned. When 0 is given as input, then inodes of any generation can
> be returned. Therefore, generation 0 is a special case that should be
> avoided when assigning generation to inodes.

I'd agree with this change to avoid assigning generation == 0 to real inodes.

Also, the separate question arises about whether we need to allow file handle
lookup with generation == 0?  That allows FID guessing easily, while requiring
a non-zero generation makes that a lot harder.

What are the cases where generation == 0 are used?

> A new inline function, ext4_inode_set_gen, will take care of the
> problem.  Now, inodes cannot have a generation of 0, so this patch fixes
> the issue.
> 
> Signed-off-by: Kyungchan Koh <kkc6196@fb.com>
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 3219154..74c6677 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1549,6 +1549,14 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
> 		 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
> }
> 
> +static inline void ext4_inode_set_gen(struct inode *inode,
> +				      struct ext4_sb_info *sbi)
> +{
> +	inode->i_generation = sbi->s_next_generation++;
> +	if (!inode->i_generation)

This should be marked "unlikely()" since it happens at most once every 4B
file creations (though likely even less since it is unlikely that so many
files will be created in a single mount).

> +		inode->i_generation = sbi->s_next_generation++;
> +}
> +
> 
> diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
> index 98ac2f1..d33f6f0 100644
> --- a/fs/ext4/ialloc.c
> +++ b/fs/ext4/ialloc.c
> @@ -1072,7 +1072,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode	}
> 	spin_lock(&sbi->s_next_gen_lock);
> -	inode->i_generation = sbi->s_next_generation++;
> +	ext4_inode_set_gen(inode, sbi);
> 	spin_unlock(&sbi->s_next_gen_lock);
> 
> diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
> index 0c21e22..d52a467 100644
> --- a/fs/ext4/ioctl.c
> +++ b/fs/ext4/ioctl.c
> @@ -160,8 +160,8 @@ static long swap_inode_boot_loader(struct super_block *sb,
> 
> 	spin_lock(&sbi->s_next_gen_lock);
> -	inode->i_generation = sbi->s_next_generation++;
> -	inode_bl->i_generation = sbi->s_next_generation++;
> +	ext4_inode_set_gen(inode, sbi);
> +	ext4_inode_set_gen(inode_bl, sbi);
> 	spin_unlock(&sbi->s_next_gen_lock);
> 


Cheers, Andreas
Kyungchan Koh June 29, 2017, 4:37 a.m. UTC | #4
On 6/28/17, 7:32 PM, "Andreas Dilger" <adilger@dilger.ca> wrote:

    On Jun 28, 2017, at 4:06 PM, Kyungchan Koh <kkc6196@fb.com> wrote:
    > 

    > In fs/ext4/super.c, the function ext4_nfs_get_inode takes as input

    > "generation" that can be used to specify the generation of the inode to

    > be returned. When 0 is given as input, then inodes of any generation can

    > be returned. Therefore, generation 0 is a special case that should be

    > avoided when assigning generation to inodes.

    
    I'd agree with this change to avoid assigning generation == 0 to real inodes.
    
    Also, the separate question arises about whether we need to allow file handle
    lookup with generation == 0?  That allows FID guessing easily, while requiring
    a non-zero generation makes that a lot harder.
    
    What are the cases where generation == 0 are used?

Honestly, I’m not too sure. I just noticed that generation 0 was a special
case from reading the code.

    > A new inline function, ext4_inode_set_gen, will take care of the

    > problem.  Now, inodes cannot have a generation of 0, so this patch fixes

    > the issue.

    > 

    > Signed-off-by: Kyungchan Koh <kkc6196@fb.com>

    > 

    > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h

    > index 3219154..74c6677 100644

    > --- a/fs/ext4/ext4.h

    > +++ b/fs/ext4/ext4.h

    > @@ -1549,6 +1549,14 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)

    > 		 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));

    > }

    > 

    > +static inline void ext4_inode_set_gen(struct inode *inode,

    > +				      struct ext4_sb_info *sbi)

    > +{

    > +	inode->i_generation = sbi->s_next_generation++;

    > +	if (!inode->i_generation)

    
    This should be marked "unlikely()" since it happens at most once every 4B
    file creations (though likely even less since it is unlikely that so many
    files will be created in a single mount).

Got it.
    
    > +		inode->i_generation = sbi->s_next_generation++;

    > +}

    > +

    > 

    > diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c

    > index 98ac2f1..d33f6f0 100644

    > --- a/fs/ext4/ialloc.c

    > +++ b/fs/ext4/ialloc.c

    > @@ -1072,7 +1072,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode	}

    > 	spin_lock(&sbi->s_next_gen_lock);

    > -	inode->i_generation = sbi->s_next_generation++;

    > +	ext4_inode_set_gen(inode, sbi);

    > 	spin_unlock(&sbi->s_next_gen_lock);

    > 

    > diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c

    > index 0c21e22..d52a467 100644

    > --- a/fs/ext4/ioctl.c

    > +++ b/fs/ext4/ioctl.c

    > @@ -160,8 +160,8 @@ static long swap_inode_boot_loader(struct super_block *sb,

    > 

    > 	spin_lock(&sbi->s_next_gen_lock);

    > -	inode->i_generation = sbi->s_next_generation++;

    > -	inode_bl->i_generation = sbi->s_next_generation++;

    > +	ext4_inode_set_gen(inode, sbi);

    > +	ext4_inode_set_gen(inode_bl, sbi);

    > 	spin_unlock(&sbi->s_next_gen_lock);

    > 

    
    
    Cheers, Andreas

This is applicable to many fs, including ext2, ext4, exofs, jfs, and f2fs.
Therefore, a shared helper in linux/fs.h will allow for easy changes
in all fs. Is there any reason that might be a bad idea?

Best,
Kyungchan Koh
Darrick Wong June 29, 2017, 4:59 a.m. UTC | #5
[add linux-xfs to cc]

On Thu, Jun 29, 2017 at 04:37:14AM +0000, William Koh wrote:
> On 6/28/17, 7:32 PM, "Andreas Dilger" <adilger@dilger.ca> wrote:
> 
>     On Jun 28, 2017, at 4:06 PM, Kyungchan Koh <kkc6196@fb.com> wrote:
>     > 
>     > In fs/ext4/super.c, the function ext4_nfs_get_inode takes as input
>     > "generation" that can be used to specify the generation of the inode to
>     > be returned. When 0 is given as input, then inodes of any generation can
>     > be returned. Therefore, generation 0 is a special case that should be
>     > avoided when assigning generation to inodes.
>     
>     I'd agree with this change to avoid assigning generation == 0 to real inodes.
>     
>     Also, the separate question arises about whether we need to allow file handle
>     lookup with generation == 0?  That allows FID guessing easily, while requiring
>     a non-zero generation makes that a lot harder.
>     
>     What are the cases where generation == 0 are used?
> 
> Honestly, I’m not too sure. I just noticed that generation 0 was a special
> case from reading the code.
> 
>     > A new inline function, ext4_inode_set_gen, will take care of the
>     > problem.  Now, inodes cannot have a generation of 0, so this patch fixes
>     > the issue.
>     > 
>     > Signed-off-by: Kyungchan Koh <kkc6196@fb.com>
>     > 
>     > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>     > index 3219154..74c6677 100644
>     > --- a/fs/ext4/ext4.h
>     > +++ b/fs/ext4/ext4.h
>     > @@ -1549,6 +1549,14 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
>     > 		 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
>     > }
>     > 
>     > +static inline void ext4_inode_set_gen(struct inode *inode,
>     > +				      struct ext4_sb_info *sbi)
>     > +{
>     > +	inode->i_generation = sbi->s_next_generation++;
>     > +	if (!inode->i_generation)
>     
>     This should be marked "unlikely()" since it happens at most once every 4B
>     file creations (though likely even less since it is unlikely that so many
>     files will be created in a single mount).
> 
> Got it.
>     
>     > +		inode->i_generation = sbi->s_next_generation++;
>     > +}
>     > +
>     > 
>     > diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
>     > index 98ac2f1..d33f6f0 100644
>     > --- a/fs/ext4/ialloc.c
>     > +++ b/fs/ext4/ialloc.c
>     > @@ -1072,7 +1072,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode	}
>     > 	spin_lock(&sbi->s_next_gen_lock);
>     > -	inode->i_generation = sbi->s_next_generation++;
>     > +	ext4_inode_set_gen(inode, sbi);
>     > 	spin_unlock(&sbi->s_next_gen_lock);
>     > 
>     > diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
>     > index 0c21e22..d52a467 100644
>     > --- a/fs/ext4/ioctl.c
>     > +++ b/fs/ext4/ioctl.c
>     > @@ -160,8 +160,8 @@ static long swap_inode_boot_loader(struct super_block *sb,
>     > 
>     > 	spin_lock(&sbi->s_next_gen_lock);
>     > -	inode->i_generation = sbi->s_next_generation++;
>     > -	inode_bl->i_generation = sbi->s_next_generation++;
>     > +	ext4_inode_set_gen(inode, sbi);
>     > +	ext4_inode_set_gen(inode_bl, sbi);
>     > 	spin_unlock(&sbi->s_next_gen_lock);
>     > 
>     
>     
>     Cheers, Andreas
> 
> This is applicable to many fs, including ext2, ext4, exofs, jfs, and f2fs.
> Therefore, a shared helper in linux/fs.h will allow for easy changes
> in all fs. Is there any reason that might be a bad idea?

AFAICT, i_generation == 0 in XFS and btrfs is just as valid as any other
number.  There is no special casing of zero in either filesystem.

So now, my curiosity intrigued, I surveyed all the Linux filesystems
that can export to NFS.  I see that there are actually quite a few fs
(ext[2-4], exofs, efs, fat, jfs, f2fs, isofs, nilfs2, reiserfs, udf,
ufs) that treat zero as a special value meaning "ignore generation
check"; others (xfs, btrfs, fuse, ntfs, ocfs2) that don't consider zero
special and always require a match; and still others (affs, befs, ceph,
gfs2, jffs2, squashfs) that don't check at all.

That to mean strongly suggests that more research is necessary to figure
out why some of the filesystems that support i_generation reserve zero
as a special value to disable generation checks and why others always
require an exact match.  Until we can recapture why things are they way
they are, it doesn't make much sense to have a helper that only applies
to half the filesystems.

Granted, the contents of a file handle are generally left up to the
individual filesystem, and the behaviors are very different, so I also
don't see that much value in hoisting i_generation updates to the VFS
level.

I guess it wouldn't really matter if XFS stopped writing i_generation =
0 onto disk, but I'm too curious about this odd difference in behavior
to let it go just yet. :)

--D

> 
> Best,
> Kyungchan Koh 
>     
>     
>     
>     
>
Kyungchan Koh June 29, 2017, 2:28 p.m. UTC | #6
On 6/28/17, 9:59 PM, "Darrick J. Wong" <darrick.wong@oracle.com> wrote:

    [add linux-xfs to cc]
    
    On Thu, Jun 29, 2017 at 04:37:14AM +0000, William Koh wrote:
    > On 6/28/17, 7:32 PM, "Andreas Dilger" <adilger@dilger.ca> wrote:

    > 

    >     On Jun 28, 2017, at 4:06 PM, Kyungchan Koh <kkc6196@fb.com> wrote:

    >     > 

    >     > In fs/ext4/super.c, the function ext4_nfs_get_inode takes as input

    >     > "generation" that can be used to specify the generation of the inode to

    >     > be returned. When 0 is given as input, then inodes of any generation can

    >     > be returned. Therefore, generation 0 is a special case that should be

    >     > avoided when assigning generation to inodes.

    >     

    >     I'd agree with this change to avoid assigning generation == 0 to real inodes.

    >     

    >     Also, the separate question arises about whether we need to allow file handle

    >     lookup with generation == 0?  That allows FID guessing easily, while requiring

    >     a non-zero generation makes that a lot harder.

    >     

    >     What are the cases where generation == 0 are used?

    > 

    > Honestly, I’m not too sure. I just noticed that generation 0 was a special

    > case from reading the code.

    > 

    >     > A new inline function, ext4_inode_set_gen, will take care of the

    >     > problem.  Now, inodes cannot have a generation of 0, so this patch fixes

    >     > the issue.

    >     > 

    >     > Signed-off-by: Kyungchan Koh <kkc6196@fb.com>

    >     > 

    >     > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h

    >     > index 3219154..74c6677 100644

    >     > --- a/fs/ext4/ext4.h

    >     > +++ b/fs/ext4/ext4.h

    >     > @@ -1549,6 +1549,14 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)

    >     > 		 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));

    >     > }

    >     > 

    >     > +static inline void ext4_inode_set_gen(struct inode *inode,

    >     > +				      struct ext4_sb_info *sbi)

    >     > +{

    >     > +	inode->i_generation = sbi->s_next_generation++;

    >     > +	if (!inode->i_generation)

    >     

    >     This should be marked "unlikely()" since it happens at most once every 4B

    >     file creations (though likely even less since it is unlikely that so many

    >     files will be created in a single mount).

    > 

    > Got it.

    >     

    >     > +		inode->i_generation = sbi->s_next_generation++;

    >     > +}

    >     > +

    >     > 

    >     > diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c

    >     > index 98ac2f1..d33f6f0 100644

    >     > --- a/fs/ext4/ialloc.c

    >     > +++ b/fs/ext4/ialloc.c

    >     > @@ -1072,7 +1072,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode	}

    >     > 	spin_lock(&sbi->s_next_gen_lock);

    >     > -	inode->i_generation = sbi->s_next_generation++;

    >     > +	ext4_inode_set_gen(inode, sbi);

    >     > 	spin_unlock(&sbi->s_next_gen_lock);

    >     > 

    >     > diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c

    >     > index 0c21e22..d52a467 100644

    >     > --- a/fs/ext4/ioctl.c

    >     > +++ b/fs/ext4/ioctl.c

    >     > @@ -160,8 +160,8 @@ static long swap_inode_boot_loader(struct super_block *sb,

    >     > 

    >     > 	spin_lock(&sbi->s_next_gen_lock);

    >     > -	inode->i_generation = sbi->s_next_generation++;

    >     > -	inode_bl->i_generation = sbi->s_next_generation++;

    >     > +	ext4_inode_set_gen(inode, sbi);

    >     > +	ext4_inode_set_gen(inode_bl, sbi);

    >     > 	spin_unlock(&sbi->s_next_gen_lock);

    >     > 

    >     

    >     

    >     Cheers, Andreas

    > 

    > This is applicable to many fs, including ext2, ext4, exofs, jfs, and f2fs.

    > Therefore, a shared helper in linux/fs.h will allow for easy changes

    > in all fs. Is there any reason that might be a bad idea?

    
    AFAICT, i_generation == 0 in XFS and btrfs is just as valid as any other
    number.  There is no special casing of zero in either filesystem.
    
    So now, my curiosity intrigued, I surveyed all the Linux filesystems
    that can export to NFS.  I see that there are actually quite a few fs
    (ext[2-4], exofs, efs, fat, jfs, f2fs, isofs, nilfs2, reiserfs, udf,
    ufs) that treat zero as a special value meaning "ignore generation
    check"; others (xfs, btrfs, fuse, ntfs, ocfs2) that don't consider zero
    special and always require a match; and still others (affs, befs, ceph,
    gfs2, jffs2, squashfs) that don't check at all.
    
    That to mean strongly suggests that more research is necessary to figure
    out why some of the filesystems that support i_generation reserve zero
    as a special value to disable generation checks and why others always
    require an exact match.  Until we can recapture why things are they way
    they are, it doesn't make much sense to have a helper that only applies
    to half the filesystems.
    
    Granted, the contents of a file handle are generally left up to the
    individual filesystem, and the behaviors are very different, so I also
    don't see that much value in hoisting i_generation updates to the VFS
    level.
    
    I guess it wouldn't really matter if XFS stopped writing i_generation =
    0 onto disk, but I'm too curious about this odd difference in behavior
    to let it go just yet. :)
    
    --D

That makes sense. I’ll try to also look into this matter and send a
newer patch with the most optimal fix to this issue.

-Kyungchan Koh
    
    > 

    > Best,

    > Kyungchan Koh 

    >     

    >     

    >     

    >     

    >
J. Bruce Fields June 29, 2017, 2:35 p.m. UTC | #7
On Wed, Jun 28, 2017 at 09:59:40PM -0700, Darrick J. Wong wrote:
> AFAICT, i_generation == 0 in XFS and btrfs is just as valid as any other
> number.  There is no special casing of zero in either filesystem.
> 
> So now, my curiosity intrigued, I surveyed all the Linux filesystems
> that can export to NFS.  I see that there are actually quite a few fs
> (ext[2-4], exofs, efs, fat, jfs, f2fs, isofs, nilfs2, reiserfs, udf,
> ufs) that treat zero as a special value meaning "ignore generation
> check"; others (xfs, btrfs, fuse, ntfs, ocfs2) that don't consider zero
> special and always require a match; and still others (affs, befs, ceph,
> gfs2, jffs2, squashfs) that don't check at all.
> 
> That to mean strongly suggests that more research is necessary to figure
> out why some of the filesystems that support i_generation reserve zero
> as a special value to disable generation checks and why others always
> require an exact match.  Until we can recapture why things are they way
> they are, it doesn't make much sense to have a helper that only applies
> to half the filesystems.

From a quick look at ext2; correct me if I got anything wrong:

	- it looks like this is *only* used by NFS fh->inode lookups,
	  there's not some other internal use for this special case.
	- filehandles are never encoded with i_generation 0, so will
	  never be returned to clients.

So, this could only ever be used by an NFS client.  But the only NFS
client that could ever use it would be a non-standard client that knew
this special feature of these particular filesystems.

Sounds like at most a slightly useful tool for malicious clients
attempting filehandle-guessing attacks.

I'm probably missing something.

--b.
Darrick Wong June 29, 2017, 5:25 p.m. UTC | #8
On Thu, Jun 29, 2017 at 10:35:51AM -0400, J. Bruce Fields wrote:
> On Wed, Jun 28, 2017 at 09:59:40PM -0700, Darrick J. Wong wrote:
> > AFAICT, i_generation == 0 in XFS and btrfs is just as valid as any other
> > number.  There is no special casing of zero in either filesystem.
> > 
> > So now, my curiosity intrigued, I surveyed all the Linux filesystems
> > that can export to NFS.  I see that there are actually quite a few fs
> > (ext[2-4], exofs, efs, fat, jfs, f2fs, isofs, nilfs2, reiserfs, udf,
> > ufs) that treat zero as a special value meaning "ignore generation
> > check"; others (xfs, btrfs, fuse, ntfs, ocfs2) that don't consider zero
> > special and always require a match; and still others (affs, befs, ceph,
> > gfs2, jffs2, squashfs) that don't check at all.
> > 
> > That to mean strongly suggests that more research is necessary to figure
> > out why some of the filesystems that support i_generation reserve zero
> > as a special value to disable generation checks and why others always
> > require an exact match.  Until we can recapture why things are they way
> > they are, it doesn't make much sense to have a helper that only applies
> > to half the filesystems.
> 
> From a quick look at ext2; correct me if I got anything wrong:
> 
> 	- it looks like this is *only* used by NFS fh->inode lookups,
> 	  there's not some other internal use for this special case.
> 	- filehandles are never encoded with i_generation 0, so will
> 	  never be returned to clients.
> 
> So, this could only ever be used by an NFS client.  But the only NFS
> client that could ever use it would be a non-standard client that knew
> this special feature of these particular filesystems.
> 
> Sounds like at most a slightly useful tool for malicious clients
> attempting filehandle-guessing attacks.
> 
> I'm probably missing something.

Was there ever a version of NFS (or more generally callers of the
exportfs code) that couldn't deal with i_generation in the file handle,
and therefore we invented this generation hack to work around the loss
of the generation information?

There's a comment in xfs_fs_encode_fh about not supporting 64bit inodes
with subtree_check (which seems to require one ino/gen pair for the file
and a second pair for the file's parent) on NFSv2 because v2 doesn't
provide enough space for all the file handle information, but that's the
furthest I got with lazy-mining the git history. :)

--D

> 
> --b.
J. Bruce Fields June 29, 2017, 6:30 p.m. UTC | #9
On Thu, Jun 29, 2017 at 10:25:28AM -0700, Darrick J. Wong wrote:
> Was there ever a version of NFS (or more generally callers of the
> exportfs code) that couldn't deal with i_generation in the file handle,
> and therefore we invented this generation hack to work around the loss
> of the generation information?
> 
> There's a comment in xfs_fs_encode_fh about not supporting 64bit inodes
> with subtree_check (which seems to require one ino/gen pair for the file
> and a second pair for the file's parent) on NFSv2 because v2 doesn't
> provide enough space for all the file handle information, but that's the
> furthest I got with lazy-mining the git history. :)

There's a comment in fs/ext4/super.c:ext4_nfs_get_inode

	* Currently we don't know the generation for parent directory, so
	* a generation of 0 means "accept any"

But I don't see that used.

It was used once upon a time; I see it actually used in old 2.5 code in
nfsd_get_dentry.  Hm.

--b.
J. Bruce Fields June 29, 2017, 6:50 p.m. UTC | #10
On Thu, Jun 29, 2017 at 02:30:53PM -0400, J. Bruce Fields wrote:
> On Thu, Jun 29, 2017 at 10:25:28AM -0700, Darrick J. Wong wrote:
> > Was there ever a version of NFS (or more generally callers of the
> > exportfs code) that couldn't deal with i_generation in the file handle,
> > and therefore we invented this generation hack to work around the loss
> > of the generation information?
> > 
> > There's a comment in xfs_fs_encode_fh about not supporting 64bit inodes
> > with subtree_check (which seems to require one ino/gen pair for the file
> > and a second pair for the file's parent) on NFSv2 because v2 doesn't
> > provide enough space for all the file handle information, but that's the
> > furthest I got with lazy-mining the git history. :)
> 
> There's a comment in fs/ext4/super.c:ext4_nfs_get_inode
> 
> 	* Currently we don't know the generation for parent directory, so
> 	* a generation of 0 means "accept any"
> 
> But I don't see that used.
> 
> It was used once upon a time; I see it actually used in old 2.5 code in
> nfsd_get_dentry.  Hm.

Oh, maybe it's here in fs/libfs.c:generic_fh_to_parent:

	switch (fh_type) {
	case FILEID_INO32_GEN_PARENT:
		inode = get_inode(sb, fid->i32.parent_ino,
        			  (fh_len > 3 ? fid->i32.parent_gen : 0));
		break;
	}

I'm not sure under what conditions that filehandle encoding is used.

--b.
Darrick Wong July 4, 2017, 4:04 a.m. UTC | #11
On Thu, Jun 29, 2017 at 02:50:22PM -0400, J. Bruce Fields wrote:
> On Thu, Jun 29, 2017 at 02:30:53PM -0400, J. Bruce Fields wrote:
> > On Thu, Jun 29, 2017 at 10:25:28AM -0700, Darrick J. Wong wrote:
> > > Was there ever a version of NFS (or more generally callers of the
> > > exportfs code) that couldn't deal with i_generation in the file handle,
> > > and therefore we invented this generation hack to work around the loss
> > > of the generation information?
> > > 
> > > There's a comment in xfs_fs_encode_fh about not supporting 64bit inodes
> > > with subtree_check (which seems to require one ino/gen pair for the file
> > > and a second pair for the file's parent) on NFSv2 because v2 doesn't
> > > provide enough space for all the file handle information, but that's the
> > > furthest I got with lazy-mining the git history. :)
> > 
> > There's a comment in fs/ext4/super.c:ext4_nfs_get_inode
> > 
> > 	* Currently we don't know the generation for parent directory, so
> > 	* a generation of 0 means "accept any"
> > 
> > But I don't see that used.
> > 
> > It was used once upon a time; I see it actually used in old 2.5 code in
> > nfsd_get_dentry.  Hm.
> 
> Oh, maybe it's here in fs/libfs.c:generic_fh_to_parent:
> 
> 	switch (fh_type) {
> 	case FILEID_INO32_GEN_PARENT:
> 		inode = get_inode(sb, fid->i32.parent_ino,
>         			  (fh_len > 3 ? fid->i32.parent_gen : 0));
> 		break;
> 	}
> 
> I'm not sure under what conditions that filehandle encoding is used.

The best guess I can come up with is the old nfs_fhbase_old style handles,
which (afaict) do not carry parent i_generation?

--D

> 
> --b.
J. Bruce Fields July 5, 2017, 1:15 a.m. UTC | #12
On Mon, Jul 03, 2017 at 09:04:46PM -0700, Darrick J. Wong wrote:
> On Thu, Jun 29, 2017 at 02:50:22PM -0400, J. Bruce Fields wrote:
> > On Thu, Jun 29, 2017 at 02:30:53PM -0400, J. Bruce Fields wrote:
> > > On Thu, Jun 29, 2017 at 10:25:28AM -0700, Darrick J. Wong wrote:
> > > > Was there ever a version of NFS (or more generally callers of the
> > > > exportfs code) that couldn't deal with i_generation in the file handle,
> > > > and therefore we invented this generation hack to work around the loss
> > > > of the generation information?
> > > > 
> > > > There's a comment in xfs_fs_encode_fh about not supporting 64bit inodes
> > > > with subtree_check (which seems to require one ino/gen pair for the file
> > > > and a second pair for the file's parent) on NFSv2 because v2 doesn't
> > > > provide enough space for all the file handle information, but that's the
> > > > furthest I got with lazy-mining the git history. :)
> > > 
> > > There's a comment in fs/ext4/super.c:ext4_nfs_get_inode
> > > 
> > > 	* Currently we don't know the generation for parent directory, so
> > > 	* a generation of 0 means "accept any"
> > > 
> > > But I don't see that used.
> > > 
> > > It was used once upon a time; I see it actually used in old 2.5 code in
> > > nfsd_get_dentry.  Hm.
> > 
> > Oh, maybe it's here in fs/libfs.c:generic_fh_to_parent:
> > 
> > 	switch (fh_type) {
> > 	case FILEID_INO32_GEN_PARENT:
> > 		inode = get_inode(sb, fid->i32.parent_ino,
> >         			  (fh_len > 3 ? fid->i32.parent_gen : 0));
> > 		break;
> > 	}
> > 
> > I'm not sure under what conditions that filehandle encoding is used.
> 
> The best guess I can come up with is the old nfs_fhbase_old style handles,
> which (afaict) do not carry parent i_generation?

Yeah, I just couldn't tell in the time I looked whether they could still
be handed out.

If not, then the only way they'd still be used is if a client had a
server continually mounted while the server was upgraded from a kernel
that still handed out the old filehandle.

So if they haven't been given out for long enough it's possible nobody
would notice if we dropped support.

But, I didn't get far enough to figure that out.

--b.
Darrick Wong July 5, 2017, 7:19 p.m. UTC | #13
On Tue, Jul 04, 2017 at 09:15:34PM -0400, J. Bruce Fields wrote:
> On Mon, Jul 03, 2017 at 09:04:46PM -0700, Darrick J. Wong wrote:
> > On Thu, Jun 29, 2017 at 02:50:22PM -0400, J. Bruce Fields wrote:
> > > On Thu, Jun 29, 2017 at 02:30:53PM -0400, J. Bruce Fields wrote:
> > > > On Thu, Jun 29, 2017 at 10:25:28AM -0700, Darrick J. Wong wrote:
> > > > > Was there ever a version of NFS (or more generally callers of the
> > > > > exportfs code) that couldn't deal with i_generation in the file handle,
> > > > > and therefore we invented this generation hack to work around the loss
> > > > > of the generation information?
> > > > > 
> > > > > There's a comment in xfs_fs_encode_fh about not supporting 64bit inodes
> > > > > with subtree_check (which seems to require one ino/gen pair for the file
> > > > > and a second pair for the file's parent) on NFSv2 because v2 doesn't
> > > > > provide enough space for all the file handle information, but that's the
> > > > > furthest I got with lazy-mining the git history. :)
> > > > 
> > > > There's a comment in fs/ext4/super.c:ext4_nfs_get_inode
> > > > 
> > > > 	* Currently we don't know the generation for parent directory, so
> > > > 	* a generation of 0 means "accept any"
> > > > 
> > > > But I don't see that used.
> > > > 
> > > > It was used once upon a time; I see it actually used in old 2.5 code in
> > > > nfsd_get_dentry.  Hm.
> > > 
> > > Oh, maybe it's here in fs/libfs.c:generic_fh_to_parent:
> > > 
> > > 	switch (fh_type) {
> > > 	case FILEID_INO32_GEN_PARENT:
> > > 		inode = get_inode(sb, fid->i32.parent_ino,
> > >         			  (fh_len > 3 ? fid->i32.parent_gen : 0));
> > > 		break;
> > > 	}
> > > 
> > > I'm not sure under what conditions that filehandle encoding is used.
> > 
> > The best guess I can come up with is the old nfs_fhbase_old style handles,
> > which (afaict) do not carry parent i_generation?
> 
> Yeah, I just couldn't tell in the time I looked whether they could still
> be handed out.
> 
> If not, then the only way they'd still be used is if a client had a
> server continually mounted while the server was upgraded from a kernel
> that still handed out the old filehandle.
> 
> So if they haven't been given out for long enough it's possible nobody
> would notice if we dropped support.
> 
> But, I didn't get far enough to figure that out.

Hmm, so looking back through prehistory, Linux prior to 2.3.51 (11 March
2000) gave out the old dentry style fhandles.  After that, the kernel
only gave out the new style handles that we still use today.  In 2.4.6
(4 July 2001) the behavior was modified again to chain handle types,
i.e. if the client passed in an old style handle then it would get
another old style handle back.  The changelog for -pre9 says that this
was done for compatibility reasons.

So, what's the probability that there are clients out there that started
talking to a 2.2-based knfsd and will now want to talk to a modern 4.13
kernel seventeen years later?  (Do nfs handles persist across client
restarts/remounts?)

--D

> 
> --b.
Theodore Ts'o July 5, 2017, 8:27 p.m. UTC | #14
On Wed, Jul 05, 2017 at 12:19:33PM -0700, Darrick J. Wong wrote:
> 
> So, what's the probability that there are clients out there that started
> talking to a 2.2-based knfsd and will now want to talk to a modern 4.13
> kernel seventeen years later?  (Do nfs handles persist across client
> restarts/remounts?)

It's whether or not nfs handles persist across server restarts which
would be the more interesting question.  So if you had a NAS box that
was using a Linux 2.2 kernel, and you had clients access the box, and
then that box gets upgraded to use a 4.13 kernel, what happens?

In the ideal world, the client wouldn't notice, and its 2.2-based file
handles that it obtained while the 2.2 kernel was running would
continue to work after the box came back up running the new 4.13
kernel.

To be honest, I'm not sure I care that much, but I don't use NFS much
if at all these days myself.  And in reality, what's the chance that
an NAS box vendor would continue to support a box that is 17 years old
and provide an upgrade for it?  (OK, everyone can stop laughing now. :-)

    	       	       	   	- Ted
J. Bruce Fields July 5, 2017, 8:49 p.m. UTC | #15
On Wed, Jul 05, 2017 at 12:19:33PM -0700, Darrick J. Wong wrote:
> On Tue, Jul 04, 2017 at 09:15:34PM -0400, J. Bruce Fields wrote:
> > On Mon, Jul 03, 2017 at 09:04:46PM -0700, Darrick J. Wong wrote:
> > > On Thu, Jun 29, 2017 at 02:50:22PM -0400, J. Bruce Fields wrote:
> > > > On Thu, Jun 29, 2017 at 02:30:53PM -0400, J. Bruce Fields wrote:
> > > > > On Thu, Jun 29, 2017 at 10:25:28AM -0700, Darrick J. Wong wrote:
> > > > > > Was there ever a version of NFS (or more generally callers of the
> > > > > > exportfs code) that couldn't deal with i_generation in the file handle,
> > > > > > and therefore we invented this generation hack to work around the loss
> > > > > > of the generation information?
> > > > > > 
> > > > > > There's a comment in xfs_fs_encode_fh about not supporting 64bit inodes
> > > > > > with subtree_check (which seems to require one ino/gen pair for the file
> > > > > > and a second pair for the file's parent) on NFSv2 because v2 doesn't
> > > > > > provide enough space for all the file handle information, but that's the
> > > > > > furthest I got with lazy-mining the git history. :)
> > > > > 
> > > > > There's a comment in fs/ext4/super.c:ext4_nfs_get_inode
> > > > > 
> > > > > 	* Currently we don't know the generation for parent directory, so
> > > > > 	* a generation of 0 means "accept any"
> > > > > 
> > > > > But I don't see that used.
> > > > > 
> > > > > It was used once upon a time; I see it actually used in old 2.5 code in
> > > > > nfsd_get_dentry.  Hm.
> > > > 
> > > > Oh, maybe it's here in fs/libfs.c:generic_fh_to_parent:
> > > > 
> > > > 	switch (fh_type) {
> > > > 	case FILEID_INO32_GEN_PARENT:
> > > > 		inode = get_inode(sb, fid->i32.parent_ino,
> > > >         			  (fh_len > 3 ? fid->i32.parent_gen : 0));
> > > > 		break;
> > > > 	}
> > > > 
> > > > I'm not sure under what conditions that filehandle encoding is used.
> > > 
> > > The best guess I can come up with is the old nfs_fhbase_old style handles,
> > > which (afaict) do not carry parent i_generation?
> > 
> > Yeah, I just couldn't tell in the time I looked whether they could still
> > be handed out.
> > 
> > If not, then the only way they'd still be used is if a client had a
> > server continually mounted while the server was upgraded from a kernel
> > that still handed out the old filehandle.
> > 
> > So if they haven't been given out for long enough it's possible nobody
> > would notice if we dropped support.
> > 
> > But, I didn't get far enough to figure that out.
> 
> Hmm, so looking back through prehistory, Linux prior to 2.3.51 (11 March
> 2000) gave out the old dentry style fhandles.  After that, the kernel
> only gave out the new style handles that we still use today.  In 2.4.6
> (4 July 2001) the behavior was modified again to chain handle types,
> i.e. if the client passed in an old style handle then it would get
> another old style handle back.  The changelog for -pre9 says that this
> was done for compatibility reasons.

Yeah, you're supposed to be able to reboot your NFS server for a kernel
upgrade without your client applications experiencing anything worse
than a temporary hang while you wait for the server to come back up.
So, changing the filehandle format and returning ESTALE to everyone
would be unpopular.

> So, what's the probability that there are clients out there that started
> talking to a 2.2-based knfsd and will now want to talk to a modern 4.13
> kernel seventeen years later?

I think it's unlikely enough that we could drop that code; cc'ing Neil
in case we overlooked anything.

> (Do nfs handles persist across client restarts/remounts?)

No.

(Well, with maybe a couple exceptions (fscache and persistent NFSv4
delegations) but neither seem relevant here.)

--b.
NeilBrown July 6, 2017, 1:08 a.m. UTC | #16
On Wed, Jul 05 2017, J. Bruce Fields wrote:

> On Wed, Jul 05, 2017 at 12:19:33PM -0700, Darrick J. Wong wrote:
>> On Tue, Jul 04, 2017 at 09:15:34PM -0400, J. Bruce Fields wrote:
>> > On Mon, Jul 03, 2017 at 09:04:46PM -0700, Darrick J. Wong wrote:
>> > > On Thu, Jun 29, 2017 at 02:50:22PM -0400, J. Bruce Fields wrote:
>> > > > On Thu, Jun 29, 2017 at 02:30:53PM -0400, J. Bruce Fields wrote:
>> > > > > On Thu, Jun 29, 2017 at 10:25:28AM -0700, Darrick J. Wong wrote:
>> > > > > > Was there ever a version of NFS (or more generally callers of the
>> > > > > > exportfs code) that couldn't deal with i_generation in the file handle,
>> > > > > > and therefore we invented this generation hack to work around the loss
>> > > > > > of the generation information?
>> > > > > > 
>> > > > > > There's a comment in xfs_fs_encode_fh about not supporting 64bit inodes
>> > > > > > with subtree_check (which seems to require one ino/gen pair for the file
>> > > > > > and a second pair for the file's parent) on NFSv2 because v2 doesn't
>> > > > > > provide enough space for all the file handle information, but that's the
>> > > > > > furthest I got with lazy-mining the git history. :)
>> > > > > 
>> > > > > There's a comment in fs/ext4/super.c:ext4_nfs_get_inode
>> > > > > 
>> > > > > 	* Currently we don't know the generation for parent directory, so
>> > > > > 	* a generation of 0 means "accept any"
>> > > > > 
>> > > > > But I don't see that used.
>> > > > > 
>> > > > > It was used once upon a time; I see it actually used in old 2.5 code in
>> > > > > nfsd_get_dentry.  Hm.
>> > > > 
>> > > > Oh, maybe it's here in fs/libfs.c:generic_fh_to_parent:
>> > > > 
>> > > > 	switch (fh_type) {
>> > > > 	case FILEID_INO32_GEN_PARENT:
>> > > > 		inode = get_inode(sb, fid->i32.parent_ino,
>> > > >         			  (fh_len > 3 ? fid->i32.parent_gen : 0));
>> > > > 		break;
>> > > > 	}
>> > > > 
>> > > > I'm not sure under what conditions that filehandle encoding is used.
>> > > 
>> > > The best guess I can come up with is the old nfs_fhbase_old style handles,
>> > > which (afaict) do not carry parent i_generation?
>> > 
>> > Yeah, I just couldn't tell in the time I looked whether they could still
>> > be handed out.
>> > 
>> > If not, then the only way they'd still be used is if a client had a
>> > server continually mounted while the server was upgraded from a kernel
>> > that still handed out the old filehandle.
>> > 
>> > So if they haven't been given out for long enough it's possible nobody
>> > would notice if we dropped support.
>> > 
>> > But, I didn't get far enough to figure that out.
>> 
>> Hmm, so looking back through prehistory, Linux prior to 2.3.51 (11 March
>> 2000) gave out the old dentry style fhandles.  After that, the kernel
>> only gave out the new style handles that we still use today.  In 2.4.6
>> (4 July 2001) the behavior was modified again to chain handle types,
>> i.e. if the client passed in an old style handle then it would get
>> another old style handle back.  The changelog for -pre9 says that this
>> was done for compatibility reasons.
>
> Yeah, you're supposed to be able to reboot your NFS server for a kernel
> upgrade without your client applications experiencing anything worse
> than a temporary hang while you wait for the server to come back up.
> So, changing the filehandle format and returning ESTALE to everyone
> would be unpopular.
>
>> So, what's the probability that there are clients out there that started
>> talking to a 2.2-based knfsd and will now want to talk to a modern 4.13
>> kernel seventeen years later?
>
> I think it's unlikely enough that we could drop that code; cc'ing Neil
> in case we overlooked anything.

While I remain a fan of maintaining forward/backward compatibility as
much as possible, 15 years is probably more than I can realistically
hope for.
As you say, a generation number of '0' is only special when old-style
file handles are used, with the "subtree_check" export option.  They are
unlikely to have been used recently.

However, I note that include/linux/exportfs.h says:

	/*
	 * 32bit inode number, 32 bit generation number,
	 * 32 bit parent directory inode number.
	 */
	FILEID_INO32_GEN_PARENT = 2,

This could be seen as misleading.
Some code that reports that fid_type includes the directory generation
number.  Some other code (cephfs, squashfs) doesn't even include the
generation number for the inode (which is OK for squashfs as it is
write-only).  I could find no code that matches this documentation.

I was never a fan of having generic fid_types.  Except for 0 and 255,
these numbers are generated and interpreted by individual filesystems,
and there is no reason that they should agree on the interpretation (and
as we see here, they don't).

But for the main point of your question: I see no problem with removing
nfs_fhbase_old and related code, and that includes the special handling
of generation number zero.

Thanks,
NeilBrown

>
>> (Do nfs handles persist across client restarts/remounts?)
>
> No.
>
> (Well, with maybe a couple exceptions (fscache and persistent NFSv4
> delegations) but neither seem relevant here.)
>
> --b.
J. Bruce Fields July 6, 2017, 2:39 a.m. UTC | #17
On Thu, Jul 06, 2017 at 11:08:27AM +1000, NeilBrown wrote:
> On Wed, Jul 05 2017, J. Bruce Fields wrote:
> 
> > On Wed, Jul 05, 2017 at 12:19:33PM -0700, Darrick J. Wong wrote:
> >> So, what's the probability that there are clients out there that started
> >> talking to a 2.2-based knfsd and will now want to talk to a modern 4.13
> >> kernel seventeen years later?
> >
> > I think it's unlikely enough that we could drop that code;

Wow, that was a terrible sentence.  What I meant was: I think it's
unlikely that such a client exists, therefore I'm OK with dropping that
code.

Anyway:

> cc'ing Neil
> > in case we overlooked anything.
> 
> While I remain a fan of maintaining forward/backward compatibility as
> much as possible, 15 years is probably more than I can realistically
> hope for.
> As you say, a generation number of '0' is only special when old-style
> file handles are used, with the "subtree_check" export option.  They are
> unlikely to have been used recently.
...
> But for the main point of your question: I see no problem with removing
> nfs_fhbase_old and related code, and that includes the special handling
> of generation number zero.

So, we agree, OK.

I dunno if this is actually urgent.  But it'd be nice to clean out.

--b.
Jeff Layton July 7, 2017, 10:51 a.m. UTC | #18
On Wed, 2017-07-05 at 16:27 -0400, Theodore Ts'o wrote:
> On Wed, Jul 05, 2017 at 12:19:33PM -0700, Darrick J. Wong wrote:
> > 
> > So, what's the probability that there are clients out there that started
> > talking to a 2.2-based knfsd and will now want to talk to a modern 4.13
> > kernel seventeen years later?  (Do nfs handles persist across client
> > restarts/remounts?)
> 
> It's whether or not nfs handles persist across server restarts which
> would be the more interesting question.  So if you had a NAS box that
> was using a Linux 2.2 kernel, and you had clients access the box, and
> then that box gets upgraded to use a 4.13 kernel, what happens?
> 
> In the ideal world, the client wouldn't notice, and its 2.2-based file
> handles that it obtained while the 2.2 kernel was running would
> continue to work after the box came back up running the new 4.13
> kernel.
> 

Right. That's the case today if we don't remove support for old
filehandles. If we were to remove them, the clients would get back
-ESTALE there if they tried to use the old 2.2-style fh's that they saw
before the upgrade.

The main takeaway here is that NFS filehandle lifetime is really only
bounded by the boot time of the oldest clients.

> To be honest, I'm not sure I care that much, but I don't use NFS much
> if at all these days myself.  And in reality, what's the chance that
> an NAS box vendor would continue to support a box that is 17 years old
> and provide an upgrade for it?  (OK, everyone can stop laughing now. :-)

Agreed. I think we're safe enough to remove this support.

If we were really concerned about it, we could move support for it under
a Kconfig option that defaults to "off" for now, and then we could plan
to just drop it in a couple of years.

Vendors that thought they might need it could enable it and speak up
with their use case if they wanted to keep it in for future releases.
Most likely though, no-one will care.
Theodore Ts'o July 7, 2017, 3:51 p.m. UTC | #19
On Fri, Jul 07, 2017 at 06:51:37AM -0400, Jeff Layton wrote:
> 
> Right. That's the case today if we don't remove support for old
> filehandles. If we were to remove them, the clients would get back
> -ESTALE there if they tried to use the old 2.2-style fh's that they saw
> before the upgrade.
> 
> The main takeaway here is that NFS filehandle lifetime is really only
> bounded by the boot time of the oldest clients.

Well, and how long an NFS server is still up.  So one could construct
a use case where a (hypothetical) system administrator had a RHEL 7.0
system with a 2.2.16-22 kernel, and they try to update it to a
(hypothetical) RHEL 10 kernel in one fell swoop with a 4.13+ kernel
that no longer supports the 2-2-style fh's.  A client that had the
server mounted when it was running the 2.2 kernel might only be up for
a few hours, before the upgrade to RHEL 10 happened, and then the
client would get ESTALE errors.

Of course, I've stopped carrying about enterprise kernel support a
long time ago, so I just think that scenario is funny.  I recognize
that folks who work at Red Hat have to worry about such things --- and
I'm sorry.  :-)

In reality a server installed with RHEL 7.0 has probably died of old
age by now --- unless someone crazy is running it in a VMware VM
because they had some enterprise software package or some bar-code
printing module for which they don't have source code[1], and so they are
stuck on RHEL 7.0, even in 2017.  Have I mentioned I'm so glad I don't
have to worry these sorts of things any more?

					- Ted

[1] That wasn't a made up example; I once visited a customer on site,
back in the day, that had that exact problem, and so they were stuck
on some antique version of RHEL, and they expected me to help them.
Jeff Layton July 7, 2017, 4:13 p.m. UTC | #20
On Fri, 2017-07-07 at 11:51 -0400, Theodore Ts'o wrote:
> On Fri, Jul 07, 2017 at 06:51:37AM -0400, Jeff Layton wrote:
> > 
> > Right. That's the case today if we don't remove support for old
> > filehandles. If we were to remove them, the clients would get back
> > -ESTALE there if they tried to use the old 2.2-style fh's that they saw
> > before the upgrade.
> > 
> > The main takeaway here is that NFS filehandle lifetime is really only
> > bounded by the boot time of the oldest clients.
> 
> Well, and how long an NFS server is still up.  So one could construct
> a use case where a (hypothetical) system administrator had a RHEL 7.0
> system with a 2.2.16-22 kernel, and they try to update it to a
> (hypothetical) RHEL 10 kernel in one fell swoop with a 4.13+ kernel
> that no longer supports the 2-2-style fh's.  A client that had the
> server mounted when it was running the 2.2 kernel might only be up for
> a few hours, before the upgrade to RHEL 10 happened, and then the
> client would get ESTALE errors.
> 
> Of course, I've stopped carrying about enterprise kernel support a
> long time ago, so I just think that scenario is funny.  I recognize
> that folks who work at Red Hat have to worry about such things --- and
> I'm sorry.  :-)
> 
> In reality a server installed with RHEL 7.0 has probably died of old
> age by now --- unless someone crazy is running it in a VMware VM
> because they had some enterprise software package or some bar-code
> printing module for which they don't have source code[1], and so they are
> stuck on RHEL 7.0, even in 2017.  Have I mentioned I'm so glad I don't
> have to worry these sorts of things any more?
> 
> 					- Ted
> 
> [1] That wasn't a made up example; I once visited a customer on site,
> back in the day, that had that exact problem, and so they were stuck
> on some antique version of RHEL, and they expected me to help them.

Yep, exactly. An abrupt upgrade like that is always a possibility, but
it's pretty unlikely, and I don't have a ton of sympathy for anyone who
does that.

I guess another thing we could do as an interim step is to add a
scary-looking printk that fires when someone sends us one of these
really old-style filehandles.

That won't help the poor bastard who updates the host directly from
v2.2-era kernel to the version that eventually removes support for the
old filehandles. It might give us an idea of whether there are still
clients in the field that are still using them though.
J. Bruce Fields July 7, 2017, 4:47 p.m. UTC | #21
On Fri, Jul 07, 2017 at 12:13:36PM -0400, Jeff Layton wrote:
> On Fri, 2017-07-07 at 11:51 -0400, Theodore Ts'o wrote:
> > On Fri, Jul 07, 2017 at 06:51:37AM -0400, Jeff Layton wrote:
> > > 
> > > Right. That's the case today if we don't remove support for old
> > > filehandles. If we were to remove them, the clients would get back
> > > -ESTALE there if they tried to use the old 2.2-style fh's that they saw
> > > before the upgrade.
> > > 
> > > The main takeaway here is that NFS filehandle lifetime is really only
> > > bounded by the boot time of the oldest clients.
> > 
> > Well, and how long an NFS server is still up.  So one could construct
> > a use case where a (hypothetical) system administrator had a RHEL 7.0
> > system with a 2.2.16-22 kernel, and they try to update it to a
> > (hypothetical) RHEL 10 kernel in one fell swoop with a 4.13+ kernel
> > that no longer supports the 2-2-style fh's.  A client that had the
> > server mounted when it was running the 2.2 kernel might only be up for
> > a few hours, before the upgrade to RHEL 10 happened, and then the
> > client would get ESTALE errors.
> > 
> > Of course, I've stopped carrying about enterprise kernel support a
> > long time ago, so I just think that scenario is funny.  I recognize
> > that folks who work at Red Hat have to worry about such things --- and
> > I'm sorry.  :-)
> > 
> > In reality a server installed with RHEL 7.0 has probably died of old
> > age by now --- unless someone crazy is running it in a VMware VM
> > because they had some enterprise software package or some bar-code
> > printing module for which they don't have source code[1], and so they are
> > stuck on RHEL 7.0, even in 2017.  Have I mentioned I'm so glad I don't
> > have to worry these sorts of things any more?

RHEL 7 is current, I think you mean the 17-year-old Red Hat Linux 7.
Anyone that far back is on their own as far as any enterprise distro is
concerned.

There are some exceptions to the "lifetime of a mount" rule, none real
issues, I think:

	- fscache may keep fh's around across client boots, but I
	  suspect you just lose the benefit of the cache until it
	  expires data keyed under old filehandles and repopulates the
	  cache with new ones.

	- Does the client actually depend on stable filehandles across
	  client reboots if it might cache write data under a persistent
	  delegation?  But seeing as we don't even implement persistent
	  delegations, this is a non-issue.

	- nontraditional NFS clients could do any random thing.  NFS is
	  just a protocol, we have no idea how some weird application
	  that talks NFS directly to the server might use filehandles.
	  But this is purely hypothetical, I don't know of one.

--b.
diff mbox

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3219154..74c6677 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1549,6 +1549,14 @@  static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 		 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
 }
 
+static inline void ext4_inode_set_gen(struct inode *inode,
+				      struct ext4_sb_info *sbi)
+{
+	inode->i_generation = sbi->s_next_generation++;
+	if (!inode->i_generation)
+		inode->i_generation = sbi->s_next_generation++;
+}
+
 /*
  * Inode dynamic state flags
  */
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 98ac2f1..d33f6f0 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1072,7 +1072,7 @@  struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 		goto out;
 	}
 	spin_lock(&sbi->s_next_gen_lock);
-	inode->i_generation = sbi->s_next_generation++;
+	ext4_inode_set_gen(inode, sbi);
 	spin_unlock(&sbi->s_next_gen_lock);
 
 	/* Precompute checksum seed for inode metadata */
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 0c21e22..d52a467 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -160,8 +160,8 @@  static long swap_inode_boot_loader(struct super_block *sb,
 	inode->i_ctime = inode_bl->i_ctime = current_time(inode);
 
 	spin_lock(&sbi->s_next_gen_lock);
-	inode->i_generation = sbi->s_next_generation++;
-	inode_bl->i_generation = sbi->s_next_generation++;
+	ext4_inode_set_gen(inode, sbi);
+	ext4_inode_set_gen(inode_bl, sbi);
 	spin_unlock(&sbi->s_next_gen_lock);
 
 	ext4_discard_preallocations(inode);