Patchwork [1/2] ext4: Calculate and verify inode checksums

login
register
mail settings
Submitter Darrick J. Wong
Date April 6, 2011, 10:45 p.m.
Message ID <20110406224547.GT32706@tux1.beaverton.ibm.com>
Download mbox | patch
Permalink /patch/90099/
State Superseded
Headers show

Comments

Darrick J. Wong - April 6, 2011, 10:45 p.m.
This patch introduces to ext4 the ability to calculate and verify inode
checksums.  This requires the use of a new ro compatibility flag and some
accompanying e2fsprogs patches to provide the relevant features in tune2fs and
e2fsck.

Signed-off-by: Darrick J. Wong <djwong@us.ibm.com>
---

 fs/ext4/ext4.h  |    6 ++++--
 fs/ext4/inode.c |   52 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 55 insertions(+), 3 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sunil Mushran - April 7, 2011, 12:52 a.m.
On 04/06/2011 03:45 PM, Darrick J. Wong wrote:
> This patch introduces to ext4 the ability to calculate and verify inode
> checksums.  This requires the use of a new ro compatibility flag and some
> accompanying e2fsprogs patches to provide the relevant features in tune2fs and
> e2fsck.
>
> Signed-off-by: Darrick J. Wong<djwong@us.ibm.com>
> ---
>
>   fs/ext4/ext4.h  |    6 ++++--
>   fs/ext4/inode.c |   52 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>   2 files changed, 55 insertions(+), 3 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 4daaf2b..8815928 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -617,7 +617,7 @@ struct ext4_inode {
>   		} masix2;
>   	} osd2;				/* OS dependent 2 */
>   	__le16	i_extra_isize;
> -	__le16	i_pad1;
> +	__le16  i_checksum;		/* crc16(sb_uuid+inodenum+inode) */
>   	__le32  i_ctime_extra;  /* extra Change time      (nsec<<  2 | epoch) */
>   	__le32  i_mtime_extra;  /* extra Modification time(nsec<<  2 | epoch) */
>   	__le32  i_atime_extra;  /* extra Access time      (nsec<<  2 | epoch) */
> @@ -1338,6 +1338,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
>   #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM		0x0010
>   #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK	0x0020
>   #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE	0x0040
> +#define EXT4_FEATURE_RO_COMPAT_INODE_CSUM	0x0400
>
>   #define EXT4_FEATURE_INCOMPAT_COMPRESSION	0x0001
>   #define EXT4_FEATURE_INCOMPAT_FILETYPE		0x0002
> @@ -1364,7 +1365,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
>   					 EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \
>   					 EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
>   					 EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\
> -					 EXT4_FEATURE_RO_COMPAT_HUGE_FILE)
> +					 EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
> +					 EXT4_FEATURE_RO_COMPAT_INODE_CSUM)
>
>   /*
>    * Default values for user and/or group using reserved blocks
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 1a86282..dc76f19 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -42,6 +42,7 @@
>   #include<linux/printk.h>
>   #include<linux/slab.h>
>   #include<linux/ratelimit.h>
> +#include<linux/crc16.h>
>
>   #include "ext4_jbd2.h"
>   #include "xattr.h"
> @@ -52,6 +53,40 @@
>
>   #define MPAGE_DA_EXTENT_TAIL 0x01
>
> +static __le16 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw)
> +{
> +	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> +	struct ext4_inode_info *ei = EXT4_I(inode);
> +	int offset = offsetof(struct ext4_inode, i_checksum);
> +	__le32 inum = cpu_to_le32(inode->i_ino);
> +	__u16 crc = 0;
> +
> +	if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
> +		EXT4_FEATURE_RO_COMPAT_INODE_CSUM)&&
> +	    le16_to_cpu(raw->i_extra_isize)>= 4) {
> +		crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
> +		crc = crc16(crc, (__u8 *)&inum, sizeof(inum));
> +		crc = crc16(crc, (__u8 *)raw, offset);
> +		offset += sizeof(raw->i_checksum); /* skip checksum */
> +		/* for checksum of struct ext4_inode do the rest...*/
> +		if (ei->i_extra_isize>  4)
> +			crc = crc16(crc, (__u8 *)raw + offset,
> +				    ei->i_extra_isize - 4);
> +	}
> +
> +	return cpu_to_le16(crc);
> +}
> +
> +static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw)
> +{
> +	if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
> +		EXT4_FEATURE_RO_COMPAT_INODE_CSUM)&&
> +	    (raw->i_checksum != ext4_inode_csum(inode, raw)))
> +		return 0;
> +
> +	return 1;
> +}
> +
>   static inline int ext4_begin_ordered_truncate(struct inode *inode,
>   					      loff_t new_size)
>   {
> @@ -4802,7 +4837,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
>   	struct ext4_inode *raw_inode;
>   	struct ext4_inode_info *ei;
>   	struct inode *inode;
> -	journal_t *journal = EXT4_SB(sb)->s_journal;
> +	struct ext4_sb_info *sbi = EXT4_SB(sb);
> +	journal_t *journal = sbi->s_journal;
>   	long ret;
>   	int block;
>
> @@ -4916,6 +4952,14 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
>   	} else
>   		ei->i_extra_isize = 0;
>
> +	if (!ext4_inode_csum_verify(inode, raw_inode)) {
> +		EXT4_ERROR_INODE(inode, "checksum invalid (%u != %u)",
> +		       le16_to_cpu(ext4_inode_csum(inode, raw_inode)),
> +		       le16_to_cpu(raw_inode->i_checksum));
> +		ret = -EIO;
> +		goto bad_inode;
> +	}
> +
>   	EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
>   	EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
>   	EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
> @@ -5138,6 +5182,12 @@ static int ext4_do_update_inode(handle_t *handle,
>   			raw_inode->i_version_hi =
>   			cpu_to_le32(inode->i_version>>  32);
>   		raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
> +
> +		if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
> +			EXT4_FEATURE_RO_COMPAT_INODE_CSUM)&&
> +		    EXT4_FITS_IN_INODE(raw_inode, ei, i_checksum))
> +			raw_inode->i_checksum =
> +				ext4_inode_csum(inode, raw_inode);
>   	}
>
>   	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");

You may want to look into jbd2 buffer triggers. struct jbd2_buffer_trigger_type
Instead of computing checksum on every update, it allows one to compute it
just before the journal write. More efficient.

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong - April 7, 2011, 4:40 p.m.
On Wed, Apr 06, 2011 at 05:52:43PM -0700, Sunil Mushran wrote:
> On 04/06/2011 03:45 PM, Darrick J. Wong wrote:
>> This patch introduces to ext4 the ability to calculate and verify inode
>> checksums.  This requires the use of a new ro compatibility flag and some
>> accompanying e2fsprogs patches to provide the relevant features in tune2fs and
>> e2fsck.
>>
>> Signed-off-by: Darrick J. Wong<djwong@us.ibm.com>
>> ---
>>
>>   fs/ext4/ext4.h  |    6 ++++--
>>   fs/ext4/inode.c |   52 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>>   2 files changed, 55 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index 4daaf2b..8815928 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -617,7 +617,7 @@ struct ext4_inode {
>>   		} masix2;
>>   	} osd2;				/* OS dependent 2 */
>>   	__le16	i_extra_isize;
>> -	__le16	i_pad1;
>> +	__le16  i_checksum;		/* crc16(sb_uuid+inodenum+inode) */
>>   	__le32  i_ctime_extra;  /* extra Change time      (nsec<<  2 | epoch) */
>>   	__le32  i_mtime_extra;  /* extra Modification time(nsec<<  2 | epoch) */
>>   	__le32  i_atime_extra;  /* extra Access time      (nsec<<  2 | epoch) */
>> @@ -1338,6 +1338,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
>>   #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM		0x0010
>>   #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK	0x0020
>>   #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE	0x0040
>> +#define EXT4_FEATURE_RO_COMPAT_INODE_CSUM	0x0400
>>
>>   #define EXT4_FEATURE_INCOMPAT_COMPRESSION	0x0001
>>   #define EXT4_FEATURE_INCOMPAT_FILETYPE		0x0002
>> @@ -1364,7 +1365,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
>>   					 EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \
>>   					 EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
>>   					 EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\
>> -					 EXT4_FEATURE_RO_COMPAT_HUGE_FILE)
>> +					 EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
>> +					 EXT4_FEATURE_RO_COMPAT_INODE_CSUM)
>>
>>   /*
>>    * Default values for user and/or group using reserved blocks
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index 1a86282..dc76f19 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -42,6 +42,7 @@
>>   #include<linux/printk.h>
>>   #include<linux/slab.h>
>>   #include<linux/ratelimit.h>
>> +#include<linux/crc16.h>
>>
>>   #include "ext4_jbd2.h"
>>   #include "xattr.h"
>> @@ -52,6 +53,40 @@
>>
>>   #define MPAGE_DA_EXTENT_TAIL 0x01
>>
>> +static __le16 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw)
>> +{
>> +	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
>> +	struct ext4_inode_info *ei = EXT4_I(inode);
>> +	int offset = offsetof(struct ext4_inode, i_checksum);
>> +	__le32 inum = cpu_to_le32(inode->i_ino);
>> +	__u16 crc = 0;
>> +
>> +	if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
>> +		EXT4_FEATURE_RO_COMPAT_INODE_CSUM)&&
>> +	    le16_to_cpu(raw->i_extra_isize)>= 4) {
>> +		crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
>> +		crc = crc16(crc, (__u8 *)&inum, sizeof(inum));
>> +		crc = crc16(crc, (__u8 *)raw, offset);
>> +		offset += sizeof(raw->i_checksum); /* skip checksum */
>> +		/* for checksum of struct ext4_inode do the rest...*/
>> +		if (ei->i_extra_isize>  4)
>> +			crc = crc16(crc, (__u8 *)raw + offset,
>> +				    ei->i_extra_isize - 4);
>> +	}
>> +
>> +	return cpu_to_le16(crc);
>> +}
>> +
>> +static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw)
>> +{
>> +	if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
>> +		EXT4_FEATURE_RO_COMPAT_INODE_CSUM)&&
>> +	    (raw->i_checksum != ext4_inode_csum(inode, raw)))
>> +		return 0;
>> +
>> +	return 1;
>> +}
>> +
>>   static inline int ext4_begin_ordered_truncate(struct inode *inode,
>>   					      loff_t new_size)
>>   {
>> @@ -4802,7 +4837,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
>>   	struct ext4_inode *raw_inode;
>>   	struct ext4_inode_info *ei;
>>   	struct inode *inode;
>> -	journal_t *journal = EXT4_SB(sb)->s_journal;
>> +	struct ext4_sb_info *sbi = EXT4_SB(sb);
>> +	journal_t *journal = sbi->s_journal;
>>   	long ret;
>>   	int block;
>>
>> @@ -4916,6 +4952,14 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
>>   	} else
>>   		ei->i_extra_isize = 0;
>>
>> +	if (!ext4_inode_csum_verify(inode, raw_inode)) {
>> +		EXT4_ERROR_INODE(inode, "checksum invalid (%u != %u)",
>> +		       le16_to_cpu(ext4_inode_csum(inode, raw_inode)),
>> +		       le16_to_cpu(raw_inode->i_checksum));
>> +		ret = -EIO;
>> +		goto bad_inode;
>> +	}
>> +
>>   	EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
>>   	EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
>>   	EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
>> @@ -5138,6 +5182,12 @@ static int ext4_do_update_inode(handle_t *handle,
>>   			raw_inode->i_version_hi =
>>   			cpu_to_le32(inode->i_version>>  32);
>>   		raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
>> +
>> +		if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
>> +			EXT4_FEATURE_RO_COMPAT_INODE_CSUM)&&
>> +		    EXT4_FITS_IN_INODE(raw_inode, ei, i_checksum))
>> +			raw_inode->i_checksum =
>> +				ext4_inode_csum(inode, raw_inode);
>>   	}
>>
>>   	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
>
> You may want to look into jbd2 buffer triggers. struct jbd2_buffer_trigger_type
> Instead of computing checksum on every update, it allows one to compute it
> just before the journal write. More efficient.

Yes, I see that jbd2 has triggers, looks like a nifty feature.  I suppose if I
went with that approach I'd still have to calculate the checksum in
ext4_do_update_inode in the nojournal case, and in the journal case I'd write a
trigger that would figure out which inodes in a given buffer are actually dirty
and compute their checksums.

That said, I haven't really quantified the performance impact of this naive
approach yet, so I wonder -- did you see a similar scenario with ocfs2, and
what kind of performance increase did you get by adapting the code to use the
jbd2 trigger?  If there's potentially a large increase, it would be interesting
to apply the same conversion to the group descriptor checksumming code too.

--D
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sunil Mushran - April 7, 2011, 5:10 p.m.
On 04/07/2011 09:40 AM, Darrick J. Wong wrote:
> Yes, I see that jbd2 has triggers, looks like a nifty feature.  I suppose if I
> went with that approach I'd still have to calculate the checksum in
> ext4_do_update_inode in the nojournal case, and in the journal case I'd write a
> trigger that would figure out which inodes in a given buffer are actually dirty
> and compute their checksums.

Yes, you could use a in-mem flag (set in update, cleared in trigger) to
identify dirty inodes. The discussion on stable pages could be relevant for
the nojournal case.

>
> That said, I haven't really quantified the performance impact of this naive
> approach yet, so I wonder -- did you see a similar scenario with ocfs2, and
> what kind of performance increase did you get by adapting the code to use the
> jbd2 trigger?  If there's potentially a large increase, it would be interesting
> to apply the same conversion to the group descriptor checksumming code too.

Joel Becker may remember the overhead. He wrote the patch. That said we have few
differences. ocfs2 has larger (blocksized) inodes. Also, it computes ECC. The code
is in fs/ocfs2/blockcheck.c.

As in, you may want to generate ext4 specific numbers.
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andreas Dilger - April 8, 2011, 8:58 a.m.
On 2011-04-06, at 4:45 PM, Darrick J. Wong wrote:
> This patch introduces to ext4 the ability to calculate and verify inode
> checksums.  This requires the use of a new ro compatibility flag and some
> accompanying e2fsprogs patches to provide the relevant features in tune2fs and
> e2fsck.
> 
> Signed-off-by: Darrick J. Wong <djwong@us.ibm.com>
> ---
> 
> fs/ext4/ext4.h  |    6 ++++--
> fs/ext4/inode.c |   52 +++++++++++++++++++++++++++++++++++++++++++++++++++-
> 2 files changed, 55 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 4daaf2b..8815928 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -617,7 +617,7 @@ struct ext4_inode {
> 		} masix2;
> 	} osd2;				/* OS dependent 2 */
> 	__le16	i_extra_isize;
> -	__le16	i_pad1;
> +	__le16  i_checksum;		/* crc16(sb_uuid+inodenum+inode) */

In previous discussions, we thought about using part/all of the l_i_reserved2 field to store the inode checksum.  The inode checksum is important enough to warrant a field in the core inode, so that it also works on upgraded filesystems.

> +static __le16 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw)
> +{
> +	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> +	struct ext4_inode_info *ei = EXT4_I(inode);
> +	int offset = offsetof(struct ext4_inode, i_checksum);
> +	__le32 inum = cpu_to_le32(inode->i_ino);
> +	__u16 crc = 0;
> +
> +	if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
> +		EXT4_FEATURE_RO_COMPAT_INODE_CSUM) &&
> +	    le16_to_cpu(raw->i_extra_isize) >= 4) {

If this field remains in the large part of the inode, then this check is incorrect.  i_extra_isize is itself only valid if (s_inode_size is > EXT4_GOOD_OLD_INODE_SIZE), otherwise it points at the next inode's i_mode.

Also, instead of hard-coding ">= 4" here, it would be better to use EXT4_FITS_IN_INODE(raw, EXT4_I(inode), i_checksum).  Probably no reason to put "ei" on the stack at all.

> +		crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
> +		crc = crc16(crc, (__u8 *)&inum, sizeof(inum));
> +		crc = crc16(crc, (__u8 *)raw, offset);
> +		offset += sizeof(raw->i_checksum); /* skip checksum */
> +		/* for checksum of struct ext4_inode do the rest...*/
> +		if (ei->i_extra_isize > 4)
> +			crc = crc16(crc, (__u8 *)raw + offset,
> +				    ei->i_extra_isize - 4);


> +	}
> +
> +	return cpu_to_le16(crc);
> +}
> +
> +static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw)
> +{
> +	if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
> +		EXT4_FEATURE_RO_COMPAT_INODE_CSUM) &&
> +	    (raw->i_checksum != ext4_inode_csum(inode, raw)))
> +		return 0;
> +
> +	return 1;
> +}
> +
> static inline int ext4_begin_ordered_truncate(struct inode *inode,
> 					      loff_t new_size)
> {
> @@ -4802,7 +4837,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
> 	struct ext4_inode *raw_inode;
> 	struct ext4_inode_info *ei;
> 	struct inode *inode;
> -	journal_t *journal = EXT4_SB(sb)->s_journal;
> +	struct ext4_sb_info *sbi = EXT4_SB(sb);
> +	journal_t *journal = sbi->s_journal;
> 	long ret;
> 	int block;
> 
> @@ -4916,6 +4952,14 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
> 	} else
> 		ei->i_extra_isize = 0;
> 
> +	if (!ext4_inode_csum_verify(inode, raw_inode)) {
> +		EXT4_ERROR_INODE(inode, "checksum invalid (%u != %u)",
> +		       le16_to_cpu(ext4_inode_csum(inode, raw_inode)),
> +		       le16_to_cpu(raw_inode->i_checksum));
> +		ret = -EIO;
> +		goto bad_inode;
> +	}
> +
> 	EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
> 	EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
> 	EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
> @@ -5138,6 +5182,12 @@ static int ext4_do_update_inode(handle_t *handle,
> 			raw_inode->i_version_hi =
> 			cpu_to_le32(inode->i_version >> 32);
> 		raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
> +
> +		if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
> +			EXT4_FEATURE_RO_COMPAT_INODE_CSUM) &&
> +		    EXT4_FITS_IN_INODE(raw_inode, ei, i_checksum))
> +			raw_inode->i_checksum =
> +				ext4_inode_csum(inode, raw_inode);
> 	}
> 
> 	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");


Cheers, Andreas





--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Joel Becker - April 8, 2011, 6:50 p.m.
On Thu, Apr 07, 2011 at 10:10:52AM -0700, Sunil Mushran wrote:
> On 04/07/2011 09:40 AM, Darrick J. Wong wrote:
> >That said, I haven't really quantified the performance impact of this naive
> >approach yet, so I wonder -- did you see a similar scenario with ocfs2, and
> >what kind of performance increase did you get by adapting the code to use the
> >jbd2 trigger?  If there's potentially a large increase, it would be interesting
> >to apply the same conversion to the group descriptor checksumming code too.
> 
> Joel Becker may remember the overhead. He wrote the patch. That said we have few
> differences. ocfs2 has larger (blocksized) inodes. Also, it computes ECC. The code
> is in fs/ocfs2/blockcheck.c.

	ocfs2 does the journal access/journal dirty cycle a lot more
than extN.  I think you'd want to generate your own numbers.

Joel
Darrick J. Wong - April 8, 2011, 7:12 p.m.
On Fri, Apr 08, 2011 at 02:58:27AM -0600, Andreas Dilger wrote:
> On 2011-04-06, at 4:45 PM, Darrick J. Wong wrote:
> > This patch introduces to ext4 the ability to calculate and verify inode
> > checksums.  This requires the use of a new ro compatibility flag and some
> > accompanying e2fsprogs patches to provide the relevant features in tune2fs and
> > e2fsck.
> > 
> > Signed-off-by: Darrick J. Wong <djwong@us.ibm.com>
> > ---
> > 
> > fs/ext4/ext4.h  |    6 ++++--
> > fs/ext4/inode.c |   52 +++++++++++++++++++++++++++++++++++++++++++++++++++-
> > 2 files changed, 55 insertions(+), 3 deletions(-)
> > 
> > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> > index 4daaf2b..8815928 100644
> > --- a/fs/ext4/ext4.h
> > +++ b/fs/ext4/ext4.h
> > @@ -617,7 +617,7 @@ struct ext4_inode {
> > 		} masix2;
> > 	} osd2;				/* OS dependent 2 */
> > 	__le16	i_extra_isize;
> > -	__le16	i_pad1;
> > +	__le16  i_checksum;		/* crc16(sb_uuid+inodenum+inode) */
> 
> In previous discussions, we thought about using part/all of the l_i_reserved2
> field to store the inode checksum.  The inode checksum is important enough to
> warrant a field in the core inode, so that it also works on upgraded
> filesystems.

Hmm, yes, I better like using that field too, it'll simplify the code.  Does
anyone know which of the s_creator_os codes map to the Linux format?  I'm going
to guess 1 and 2 don't... for now I suppose the mount code can check for a
Linux creator code if the inode checksum feature is set, and fail the mount if
it finds 1 or 2.  I'm not sure which version of the osd2 union FreeBSD or
"Lites" use.

> > +static __le16 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw)
> > +{
> > +	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> > +	struct ext4_inode_info *ei = EXT4_I(inode);
> > +	int offset = offsetof(struct ext4_inode, i_checksum);
> > +	__le32 inum = cpu_to_le32(inode->i_ino);
> > +	__u16 crc = 0;
> > +
> > +	if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
> > +		EXT4_FEATURE_RO_COMPAT_INODE_CSUM) &&
> > +	    le16_to_cpu(raw->i_extra_isize) >= 4) {
> 
> If this field remains in the large part of the inode, then this check is
> incorrect.  i_extra_isize is itself only valid if (s_inode_size is >
> EXT4_GOOD_OLD_INODE_SIZE), otherwise it points at the next inode's i_mode.
> 
> Also, instead of hard-coding ">= 4" here, it would be better to use
> EXT4_FITS_IN_INODE(raw, EXT4_I(inode), i_checksum).  Probably no reason to
> put "ei" on the stack at all.

As always, thank you for the feedback!

--D
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong - April 8, 2011, 7:30 p.m.
On Fri, Apr 08, 2011 at 11:50:13AM -0700, Joel Becker wrote:
> On Thu, Apr 07, 2011 at 10:10:52AM -0700, Sunil Mushran wrote:
> > On 04/07/2011 09:40 AM, Darrick J. Wong wrote:
> > >That said, I haven't really quantified the performance impact of this naive
> > >approach yet, so I wonder -- did you see a similar scenario with ocfs2, and
> > >what kind of performance increase did you get by adapting the code to use the
> > >jbd2 trigger?  If there's potentially a large increase, it would be interesting
> > >to apply the same conversion to the group descriptor checksumming code too.
> > 
> > Joel Becker may remember the overhead. He wrote the patch. That said we have few
> > differences. ocfs2 has larger (blocksized) inodes. Also, it computes ECC. The code
> > is in fs/ocfs2/blockcheck.c.

Heh, yes, ext4 uses a fairly simple crc16 and the inodes are (most likely) not
block sized.

> 	ocfs2 does the journal access/journal dirty cycle a lot more
> than extN.  I think you'd want to generate your own numbers.

Ok, I ran both the mailserver ffsb profile and a quick-and-dumb test that tried
to dirty inodes as fast as it could.  On both a regular disk, an SSD, and a
loopmounted ext4 on a tmpfs I couldn't really see much of a performance
difference at all.  I'll see about giving this a try once I get the field
location and e2fsck behavior more firmly resolved, though I suspect I won't see
much gain.

--D
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andreas Dilger - April 8, 2011, 10:49 p.m.
On 2011-04-08, at 1:12 PM, Darrick J. Wong wrote:
> On Fri, Apr 08, 2011 at 02:58:27AM -0600, Andreas Dilger wrote:
>> On 2011-04-06, at 4:45 PM, Darrick J. Wong wrote:
>>> @@ -617,7 +617,7 @@ struct ext4_inode {
>>> 		} masix2;
>>> 	} osd2;				/* OS dependent 2 */
>>> 	__le16	i_extra_isize;
>>> -	__le16	i_pad1;
>>> +	__le16  i_checksum;		/* crc16(sb_uuid+inodenum+inode) */
>> 
>> In previous discussions, we thought about using part/all of the l_i_reserved2
>> field to store the inode checksum.  The inode checksum is important enough to
>> warrant a field in the core inode, so that it also works on upgraded
>> filesystems.
> 
> Hmm, yes, I better like using that field too, it'll simplify the code.  Does
> anyone know which of the s_creator_os codes map to the Linux format?  I'm going
> to guess 1 and 2 don't... for now I suppose the mount code can check for a
> Linux creator code if the inode checksum feature is set, and fail the mount if
> it finds 1 or 2.  I'm not sure which version of the osd2 union FreeBSD or
> "Lites" use.

Actually, the e2fsprogs code has already removed the "masix" part of the union years ago, and I'm not even sure why the "hurd" part of the union remains in the kernel code.  I think only Hurd is using the "h_i_author" field, so I suspect all that is needed is to refuse tune2fs to enable EXT4_FEATURE_RO_COMPAT_INODE_CSUM if s_creator_os == EXT2_OS_HURD (1).

The kernel shouldn't really care about this field - if the checksum feature isn't enabled, then this field will be ignored, and if the checksum feature IS enabled it could only have been done by mke2fs, or tune2fs which should check if it is allowed based on s_creator_os.

>>> +static __le16 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw)
>>> +{
>>> +	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
>>> +	struct ext4_inode_info *ei = EXT4_I(inode);
>>> +	int offset = offsetof(struct ext4_inode, i_checksum);
>>> +	__le32 inum = cpu_to_le32(inode->i_ino);
>>> +	__u16 crc = 0;
>>> +
>>> +	if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
>>> +		EXT4_FEATURE_RO_COMPAT_INODE_CSUM) &&
>>> +	    le16_to_cpu(raw->i_extra_isize) >= 4) {
>> 
>> If this field remains in the large part of the inode, then this check is
>> incorrect.  i_extra_isize is itself only valid if (s_inode_size is >
>> EXT4_GOOD_OLD_INODE_SIZE), otherwise it points at the next inode's i_mode.
>> 
>> Also, instead of hard-coding ">= 4" here, it would be better to use
>> EXT4_FITS_IN_INODE(raw, EXT4_I(inode), i_checksum).  Probably no reason to
>> put "ei" on the stack at all.
> 
> As always, thank you for the feedback!
> 
> --D


Cheers, Andreas





--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4daaf2b..8815928 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -617,7 +617,7 @@  struct ext4_inode {
 		} masix2;
 	} osd2;				/* OS dependent 2 */
 	__le16	i_extra_isize;
-	__le16	i_pad1;
+	__le16  i_checksum;		/* crc16(sb_uuid+inodenum+inode) */
 	__le32  i_ctime_extra;  /* extra Change time      (nsec << 2 | epoch) */
 	__le32  i_mtime_extra;  /* extra Modification time(nsec << 2 | epoch) */
 	__le32  i_atime_extra;  /* extra Access time      (nsec << 2 | epoch) */
@@ -1338,6 +1338,7 @@  static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM		0x0010
 #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK	0x0020
 #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE	0x0040
+#define EXT4_FEATURE_RO_COMPAT_INODE_CSUM	0x0400
 
 #define EXT4_FEATURE_INCOMPAT_COMPRESSION	0x0001
 #define EXT4_FEATURE_INCOMPAT_FILETYPE		0x0002
@@ -1364,7 +1365,8 @@  static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 					 EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \
 					 EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
 					 EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\
-					 EXT4_FEATURE_RO_COMPAT_HUGE_FILE)
+					 EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
+					 EXT4_FEATURE_RO_COMPAT_INODE_CSUM)
 
 /*
  * Default values for user and/or group using reserved blocks
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1a86282..dc76f19 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -42,6 +42,7 @@ 
 #include <linux/printk.h>
 #include <linux/slab.h>
 #include <linux/ratelimit.h>
+#include <linux/crc16.h>
 
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -52,6 +53,40 @@ 
 
 #define MPAGE_DA_EXTENT_TAIL 0x01
 
+static __le16 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	int offset = offsetof(struct ext4_inode, i_checksum);
+	__le32 inum = cpu_to_le32(inode->i_ino);
+	__u16 crc = 0;
+
+	if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+		EXT4_FEATURE_RO_COMPAT_INODE_CSUM) &&
+	    le16_to_cpu(raw->i_extra_isize) >= 4) {
+		crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
+		crc = crc16(crc, (__u8 *)&inum, sizeof(inum));
+		crc = crc16(crc, (__u8 *)raw, offset);
+		offset += sizeof(raw->i_checksum); /* skip checksum */
+		/* for checksum of struct ext4_inode do the rest...*/
+		if (ei->i_extra_isize > 4)
+			crc = crc16(crc, (__u8 *)raw + offset,
+				    ei->i_extra_isize - 4);
+	}
+
+	return cpu_to_le16(crc);
+}
+
+static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw)
+{
+	if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+		EXT4_FEATURE_RO_COMPAT_INODE_CSUM) &&
+	    (raw->i_checksum != ext4_inode_csum(inode, raw)))
+		return 0;
+
+	return 1;
+}
+
 static inline int ext4_begin_ordered_truncate(struct inode *inode,
 					      loff_t new_size)
 {
@@ -4802,7 +4837,8 @@  struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	struct ext4_inode *raw_inode;
 	struct ext4_inode_info *ei;
 	struct inode *inode;
-	journal_t *journal = EXT4_SB(sb)->s_journal;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	journal_t *journal = sbi->s_journal;
 	long ret;
 	int block;
 
@@ -4916,6 +4952,14 @@  struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	} else
 		ei->i_extra_isize = 0;
 
+	if (!ext4_inode_csum_verify(inode, raw_inode)) {
+		EXT4_ERROR_INODE(inode, "checksum invalid (%u != %u)",
+		       le16_to_cpu(ext4_inode_csum(inode, raw_inode)),
+		       le16_to_cpu(raw_inode->i_checksum));
+		ret = -EIO;
+		goto bad_inode;
+	}
+
 	EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
 	EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
 	EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
@@ -5138,6 +5182,12 @@  static int ext4_do_update_inode(handle_t *handle,
 			raw_inode->i_version_hi =
 			cpu_to_le32(inode->i_version >> 32);
 		raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
+
+		if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+			EXT4_FEATURE_RO_COMPAT_INODE_CSUM) &&
+		    EXT4_FITS_IN_INODE(raw_inode, ei, i_checksum))
+			raw_inode->i_checksum =
+				ext4_inode_csum(inode, raw_inode);
 	}
 
 	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");