Message ID | 20110406224547.GT32706@tux1.beaverton.ibm.com |
---|---|
State | Superseded, archived |
Headers | show |
On 04/06/2011 03:45 PM, Darrick J. Wong wrote: > This patch introduces to ext4 the ability to calculate and verify inode > checksums. This requires the use of a new ro compatibility flag and some > accompanying e2fsprogs patches to provide the relevant features in tune2fs and > e2fsck. > > Signed-off-by: Darrick J. Wong<djwong@us.ibm.com> > --- > > fs/ext4/ext4.h | 6 ++++-- > fs/ext4/inode.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++- > 2 files changed, 55 insertions(+), 3 deletions(-) > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h > index 4daaf2b..8815928 100644 > --- a/fs/ext4/ext4.h > +++ b/fs/ext4/ext4.h > @@ -617,7 +617,7 @@ struct ext4_inode { > } masix2; > } osd2; /* OS dependent 2 */ > __le16 i_extra_isize; > - __le16 i_pad1; > + __le16 i_checksum; /* crc16(sb_uuid+inodenum+inode) */ > __le32 i_ctime_extra; /* extra Change time (nsec<< 2 | epoch) */ > __le32 i_mtime_extra; /* extra Modification time(nsec<< 2 | epoch) */ > __le32 i_atime_extra; /* extra Access time (nsec<< 2 | epoch) */ > @@ -1338,6 +1338,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) > #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 > #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 > #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 > +#define EXT4_FEATURE_RO_COMPAT_INODE_CSUM 0x0400 > > #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 > #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 > @@ -1364,7 +1365,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) > EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ > EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ > EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ > - EXT4_FEATURE_RO_COMPAT_HUGE_FILE) > + EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ > + EXT4_FEATURE_RO_COMPAT_INODE_CSUM) > > /* > * Default values for user and/or group using reserved blocks > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c > index 1a86282..dc76f19 100644 > --- a/fs/ext4/inode.c > +++ b/fs/ext4/inode.c > @@ -42,6 +42,7 @@ > #include<linux/printk.h> > #include<linux/slab.h> > #include<linux/ratelimit.h> > +#include<linux/crc16.h> > > #include "ext4_jbd2.h" > #include "xattr.h" > @@ -52,6 +53,40 @@ > > #define MPAGE_DA_EXTENT_TAIL 0x01 > > +static __le16 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw) > +{ > + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); > + struct ext4_inode_info *ei = EXT4_I(inode); > + int offset = offsetof(struct ext4_inode, i_checksum); > + __le32 inum = cpu_to_le32(inode->i_ino); > + __u16 crc = 0; > + > + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, > + EXT4_FEATURE_RO_COMPAT_INODE_CSUM)&& > + le16_to_cpu(raw->i_extra_isize)>= 4) { > + crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); > + crc = crc16(crc, (__u8 *)&inum, sizeof(inum)); > + crc = crc16(crc, (__u8 *)raw, offset); > + offset += sizeof(raw->i_checksum); /* skip checksum */ > + /* for checksum of struct ext4_inode do the rest...*/ > + if (ei->i_extra_isize> 4) > + crc = crc16(crc, (__u8 *)raw + offset, > + ei->i_extra_isize - 4); > + } > + > + return cpu_to_le16(crc); > +} > + > +static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw) > +{ > + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, > + EXT4_FEATURE_RO_COMPAT_INODE_CSUM)&& > + (raw->i_checksum != ext4_inode_csum(inode, raw))) > + return 0; > + > + return 1; > +} > + > static inline int ext4_begin_ordered_truncate(struct inode *inode, > loff_t new_size) > { > @@ -4802,7 +4837,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) > struct ext4_inode *raw_inode; > struct ext4_inode_info *ei; > struct inode *inode; > - journal_t *journal = EXT4_SB(sb)->s_journal; > + struct ext4_sb_info *sbi = EXT4_SB(sb); > + journal_t *journal = sbi->s_journal; > long ret; > int block; > > @@ -4916,6 +4952,14 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) > } else > ei->i_extra_isize = 0; > > + if (!ext4_inode_csum_verify(inode, raw_inode)) { > + EXT4_ERROR_INODE(inode, "checksum invalid (%u != %u)", > + le16_to_cpu(ext4_inode_csum(inode, raw_inode)), > + le16_to_cpu(raw_inode->i_checksum)); > + ret = -EIO; > + goto bad_inode; > + } > + > EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); > EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); > EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); > @@ -5138,6 +5182,12 @@ static int ext4_do_update_inode(handle_t *handle, > raw_inode->i_version_hi = > cpu_to_le32(inode->i_version>> 32); > raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); > + > + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, > + EXT4_FEATURE_RO_COMPAT_INODE_CSUM)&& > + EXT4_FITS_IN_INODE(raw_inode, ei, i_checksum)) > + raw_inode->i_checksum = > + ext4_inode_csum(inode, raw_inode); > } > > BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); You may want to look into jbd2 buffer triggers. struct jbd2_buffer_trigger_type Instead of computing checksum on every update, it allows one to compute it just before the journal write. More efficient. -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Apr 06, 2011 at 05:52:43PM -0700, Sunil Mushran wrote: > On 04/06/2011 03:45 PM, Darrick J. Wong wrote: >> This patch introduces to ext4 the ability to calculate and verify inode >> checksums. This requires the use of a new ro compatibility flag and some >> accompanying e2fsprogs patches to provide the relevant features in tune2fs and >> e2fsck. >> >> Signed-off-by: Darrick J. Wong<djwong@us.ibm.com> >> --- >> >> fs/ext4/ext4.h | 6 ++++-- >> fs/ext4/inode.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++- >> 2 files changed, 55 insertions(+), 3 deletions(-) >> >> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h >> index 4daaf2b..8815928 100644 >> --- a/fs/ext4/ext4.h >> +++ b/fs/ext4/ext4.h >> @@ -617,7 +617,7 @@ struct ext4_inode { >> } masix2; >> } osd2; /* OS dependent 2 */ >> __le16 i_extra_isize; >> - __le16 i_pad1; >> + __le16 i_checksum; /* crc16(sb_uuid+inodenum+inode) */ >> __le32 i_ctime_extra; /* extra Change time (nsec<< 2 | epoch) */ >> __le32 i_mtime_extra; /* extra Modification time(nsec<< 2 | epoch) */ >> __le32 i_atime_extra; /* extra Access time (nsec<< 2 | epoch) */ >> @@ -1338,6 +1338,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) >> #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 >> #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 >> #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 >> +#define EXT4_FEATURE_RO_COMPAT_INODE_CSUM 0x0400 >> >> #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 >> #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 >> @@ -1364,7 +1365,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) >> EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ >> EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ >> EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ >> - EXT4_FEATURE_RO_COMPAT_HUGE_FILE) >> + EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ >> + EXT4_FEATURE_RO_COMPAT_INODE_CSUM) >> >> /* >> * Default values for user and/or group using reserved blocks >> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c >> index 1a86282..dc76f19 100644 >> --- a/fs/ext4/inode.c >> +++ b/fs/ext4/inode.c >> @@ -42,6 +42,7 @@ >> #include<linux/printk.h> >> #include<linux/slab.h> >> #include<linux/ratelimit.h> >> +#include<linux/crc16.h> >> >> #include "ext4_jbd2.h" >> #include "xattr.h" >> @@ -52,6 +53,40 @@ >> >> #define MPAGE_DA_EXTENT_TAIL 0x01 >> >> +static __le16 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw) >> +{ >> + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); >> + struct ext4_inode_info *ei = EXT4_I(inode); >> + int offset = offsetof(struct ext4_inode, i_checksum); >> + __le32 inum = cpu_to_le32(inode->i_ino); >> + __u16 crc = 0; >> + >> + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, >> + EXT4_FEATURE_RO_COMPAT_INODE_CSUM)&& >> + le16_to_cpu(raw->i_extra_isize)>= 4) { >> + crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); >> + crc = crc16(crc, (__u8 *)&inum, sizeof(inum)); >> + crc = crc16(crc, (__u8 *)raw, offset); >> + offset += sizeof(raw->i_checksum); /* skip checksum */ >> + /* for checksum of struct ext4_inode do the rest...*/ >> + if (ei->i_extra_isize> 4) >> + crc = crc16(crc, (__u8 *)raw + offset, >> + ei->i_extra_isize - 4); >> + } >> + >> + return cpu_to_le16(crc); >> +} >> + >> +static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw) >> +{ >> + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, >> + EXT4_FEATURE_RO_COMPAT_INODE_CSUM)&& >> + (raw->i_checksum != ext4_inode_csum(inode, raw))) >> + return 0; >> + >> + return 1; >> +} >> + >> static inline int ext4_begin_ordered_truncate(struct inode *inode, >> loff_t new_size) >> { >> @@ -4802,7 +4837,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) >> struct ext4_inode *raw_inode; >> struct ext4_inode_info *ei; >> struct inode *inode; >> - journal_t *journal = EXT4_SB(sb)->s_journal; >> + struct ext4_sb_info *sbi = EXT4_SB(sb); >> + journal_t *journal = sbi->s_journal; >> long ret; >> int block; >> >> @@ -4916,6 +4952,14 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) >> } else >> ei->i_extra_isize = 0; >> >> + if (!ext4_inode_csum_verify(inode, raw_inode)) { >> + EXT4_ERROR_INODE(inode, "checksum invalid (%u != %u)", >> + le16_to_cpu(ext4_inode_csum(inode, raw_inode)), >> + le16_to_cpu(raw_inode->i_checksum)); >> + ret = -EIO; >> + goto bad_inode; >> + } >> + >> EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); >> EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); >> EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); >> @@ -5138,6 +5182,12 @@ static int ext4_do_update_inode(handle_t *handle, >> raw_inode->i_version_hi = >> cpu_to_le32(inode->i_version>> 32); >> raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); >> + >> + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, >> + EXT4_FEATURE_RO_COMPAT_INODE_CSUM)&& >> + EXT4_FITS_IN_INODE(raw_inode, ei, i_checksum)) >> + raw_inode->i_checksum = >> + ext4_inode_csum(inode, raw_inode); >> } >> >> BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); > > You may want to look into jbd2 buffer triggers. struct jbd2_buffer_trigger_type > Instead of computing checksum on every update, it allows one to compute it > just before the journal write. More efficient. Yes, I see that jbd2 has triggers, looks like a nifty feature. I suppose if I went with that approach I'd still have to calculate the checksum in ext4_do_update_inode in the nojournal case, and in the journal case I'd write a trigger that would figure out which inodes in a given buffer are actually dirty and compute their checksums. That said, I haven't really quantified the performance impact of this naive approach yet, so I wonder -- did you see a similar scenario with ocfs2, and what kind of performance increase did you get by adapting the code to use the jbd2 trigger? If there's potentially a large increase, it would be interesting to apply the same conversion to the group descriptor checksumming code too. --D -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 04/07/2011 09:40 AM, Darrick J. Wong wrote: > Yes, I see that jbd2 has triggers, looks like a nifty feature. I suppose if I > went with that approach I'd still have to calculate the checksum in > ext4_do_update_inode in the nojournal case, and in the journal case I'd write a > trigger that would figure out which inodes in a given buffer are actually dirty > and compute their checksums. Yes, you could use a in-mem flag (set in update, cleared in trigger) to identify dirty inodes. The discussion on stable pages could be relevant for the nojournal case. > > That said, I haven't really quantified the performance impact of this naive > approach yet, so I wonder -- did you see a similar scenario with ocfs2, and > what kind of performance increase did you get by adapting the code to use the > jbd2 trigger? If there's potentially a large increase, it would be interesting > to apply the same conversion to the group descriptor checksumming code too. Joel Becker may remember the overhead. He wrote the patch. That said we have few differences. ocfs2 has larger (blocksized) inodes. Also, it computes ECC. The code is in fs/ocfs2/blockcheck.c. As in, you may want to generate ext4 specific numbers. -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2011-04-06, at 4:45 PM, Darrick J. Wong wrote: > This patch introduces to ext4 the ability to calculate and verify inode > checksums. This requires the use of a new ro compatibility flag and some > accompanying e2fsprogs patches to provide the relevant features in tune2fs and > e2fsck. > > Signed-off-by: Darrick J. Wong <djwong@us.ibm.com> > --- > > fs/ext4/ext4.h | 6 ++++-- > fs/ext4/inode.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++- > 2 files changed, 55 insertions(+), 3 deletions(-) > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h > index 4daaf2b..8815928 100644 > --- a/fs/ext4/ext4.h > +++ b/fs/ext4/ext4.h > @@ -617,7 +617,7 @@ struct ext4_inode { > } masix2; > } osd2; /* OS dependent 2 */ > __le16 i_extra_isize; > - __le16 i_pad1; > + __le16 i_checksum; /* crc16(sb_uuid+inodenum+inode) */ In previous discussions, we thought about using part/all of the l_i_reserved2 field to store the inode checksum. The inode checksum is important enough to warrant a field in the core inode, so that it also works on upgraded filesystems. > +static __le16 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw) > +{ > + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); > + struct ext4_inode_info *ei = EXT4_I(inode); > + int offset = offsetof(struct ext4_inode, i_checksum); > + __le32 inum = cpu_to_le32(inode->i_ino); > + __u16 crc = 0; > + > + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, > + EXT4_FEATURE_RO_COMPAT_INODE_CSUM) && > + le16_to_cpu(raw->i_extra_isize) >= 4) { If this field remains in the large part of the inode, then this check is incorrect. i_extra_isize is itself only valid if (s_inode_size is > EXT4_GOOD_OLD_INODE_SIZE), otherwise it points at the next inode's i_mode. Also, instead of hard-coding ">= 4" here, it would be better to use EXT4_FITS_IN_INODE(raw, EXT4_I(inode), i_checksum). Probably no reason to put "ei" on the stack at all. > + crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); > + crc = crc16(crc, (__u8 *)&inum, sizeof(inum)); > + crc = crc16(crc, (__u8 *)raw, offset); > + offset += sizeof(raw->i_checksum); /* skip checksum */ > + /* for checksum of struct ext4_inode do the rest...*/ > + if (ei->i_extra_isize > 4) > + crc = crc16(crc, (__u8 *)raw + offset, > + ei->i_extra_isize - 4); > + } > + > + return cpu_to_le16(crc); > +} > + > +static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw) > +{ > + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, > + EXT4_FEATURE_RO_COMPAT_INODE_CSUM) && > + (raw->i_checksum != ext4_inode_csum(inode, raw))) > + return 0; > + > + return 1; > +} > + > static inline int ext4_begin_ordered_truncate(struct inode *inode, > loff_t new_size) > { > @@ -4802,7 +4837,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) > struct ext4_inode *raw_inode; > struct ext4_inode_info *ei; > struct inode *inode; > - journal_t *journal = EXT4_SB(sb)->s_journal; > + struct ext4_sb_info *sbi = EXT4_SB(sb); > + journal_t *journal = sbi->s_journal; > long ret; > int block; > > @@ -4916,6 +4952,14 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) > } else > ei->i_extra_isize = 0; > > + if (!ext4_inode_csum_verify(inode, raw_inode)) { > + EXT4_ERROR_INODE(inode, "checksum invalid (%u != %u)", > + le16_to_cpu(ext4_inode_csum(inode, raw_inode)), > + le16_to_cpu(raw_inode->i_checksum)); > + ret = -EIO; > + goto bad_inode; > + } > + > EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); > EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); > EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); > @@ -5138,6 +5182,12 @@ static int ext4_do_update_inode(handle_t *handle, > raw_inode->i_version_hi = > cpu_to_le32(inode->i_version >> 32); > raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); > + > + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, > + EXT4_FEATURE_RO_COMPAT_INODE_CSUM) && > + EXT4_FITS_IN_INODE(raw_inode, ei, i_checksum)) > + raw_inode->i_checksum = > + ext4_inode_csum(inode, raw_inode); > } > > BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); Cheers, Andreas -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Apr 07, 2011 at 10:10:52AM -0700, Sunil Mushran wrote: > On 04/07/2011 09:40 AM, Darrick J. Wong wrote: > >That said, I haven't really quantified the performance impact of this naive > >approach yet, so I wonder -- did you see a similar scenario with ocfs2, and > >what kind of performance increase did you get by adapting the code to use the > >jbd2 trigger? If there's potentially a large increase, it would be interesting > >to apply the same conversion to the group descriptor checksumming code too. > > Joel Becker may remember the overhead. He wrote the patch. That said we have few > differences. ocfs2 has larger (blocksized) inodes. Also, it computes ECC. The code > is in fs/ocfs2/blockcheck.c. ocfs2 does the journal access/journal dirty cycle a lot more than extN. I think you'd want to generate your own numbers. Joel
On Fri, Apr 08, 2011 at 02:58:27AM -0600, Andreas Dilger wrote: > On 2011-04-06, at 4:45 PM, Darrick J. Wong wrote: > > This patch introduces to ext4 the ability to calculate and verify inode > > checksums. This requires the use of a new ro compatibility flag and some > > accompanying e2fsprogs patches to provide the relevant features in tune2fs and > > e2fsck. > > > > Signed-off-by: Darrick J. Wong <djwong@us.ibm.com> > > --- > > > > fs/ext4/ext4.h | 6 ++++-- > > fs/ext4/inode.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++- > > 2 files changed, 55 insertions(+), 3 deletions(-) > > > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h > > index 4daaf2b..8815928 100644 > > --- a/fs/ext4/ext4.h > > +++ b/fs/ext4/ext4.h > > @@ -617,7 +617,7 @@ struct ext4_inode { > > } masix2; > > } osd2; /* OS dependent 2 */ > > __le16 i_extra_isize; > > - __le16 i_pad1; > > + __le16 i_checksum; /* crc16(sb_uuid+inodenum+inode) */ > > In previous discussions, we thought about using part/all of the l_i_reserved2 > field to store the inode checksum. The inode checksum is important enough to > warrant a field in the core inode, so that it also works on upgraded > filesystems. Hmm, yes, I better like using that field too, it'll simplify the code. Does anyone know which of the s_creator_os codes map to the Linux format? I'm going to guess 1 and 2 don't... for now I suppose the mount code can check for a Linux creator code if the inode checksum feature is set, and fail the mount if it finds 1 or 2. I'm not sure which version of the osd2 union FreeBSD or "Lites" use. > > +static __le16 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw) > > +{ > > + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); > > + struct ext4_inode_info *ei = EXT4_I(inode); > > + int offset = offsetof(struct ext4_inode, i_checksum); > > + __le32 inum = cpu_to_le32(inode->i_ino); > > + __u16 crc = 0; > > + > > + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, > > + EXT4_FEATURE_RO_COMPAT_INODE_CSUM) && > > + le16_to_cpu(raw->i_extra_isize) >= 4) { > > If this field remains in the large part of the inode, then this check is > incorrect. i_extra_isize is itself only valid if (s_inode_size is > > EXT4_GOOD_OLD_INODE_SIZE), otherwise it points at the next inode's i_mode. > > Also, instead of hard-coding ">= 4" here, it would be better to use > EXT4_FITS_IN_INODE(raw, EXT4_I(inode), i_checksum). Probably no reason to > put "ei" on the stack at all. As always, thank you for the feedback! --D -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Apr 08, 2011 at 11:50:13AM -0700, Joel Becker wrote: > On Thu, Apr 07, 2011 at 10:10:52AM -0700, Sunil Mushran wrote: > > On 04/07/2011 09:40 AM, Darrick J. Wong wrote: > > >That said, I haven't really quantified the performance impact of this naive > > >approach yet, so I wonder -- did you see a similar scenario with ocfs2, and > > >what kind of performance increase did you get by adapting the code to use the > > >jbd2 trigger? If there's potentially a large increase, it would be interesting > > >to apply the same conversion to the group descriptor checksumming code too. > > > > Joel Becker may remember the overhead. He wrote the patch. That said we have few > > differences. ocfs2 has larger (blocksized) inodes. Also, it computes ECC. The code > > is in fs/ocfs2/blockcheck.c. Heh, yes, ext4 uses a fairly simple crc16 and the inodes are (most likely) not block sized. > ocfs2 does the journal access/journal dirty cycle a lot more > than extN. I think you'd want to generate your own numbers. Ok, I ran both the mailserver ffsb profile and a quick-and-dumb test that tried to dirty inodes as fast as it could. On both a regular disk, an SSD, and a loopmounted ext4 on a tmpfs I couldn't really see much of a performance difference at all. I'll see about giving this a try once I get the field location and e2fsck behavior more firmly resolved, though I suspect I won't see much gain. --D -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2011-04-08, at 1:12 PM, Darrick J. Wong wrote: > On Fri, Apr 08, 2011 at 02:58:27AM -0600, Andreas Dilger wrote: >> On 2011-04-06, at 4:45 PM, Darrick J. Wong wrote: >>> @@ -617,7 +617,7 @@ struct ext4_inode { >>> } masix2; >>> } osd2; /* OS dependent 2 */ >>> __le16 i_extra_isize; >>> - __le16 i_pad1; >>> + __le16 i_checksum; /* crc16(sb_uuid+inodenum+inode) */ >> >> In previous discussions, we thought about using part/all of the l_i_reserved2 >> field to store the inode checksum. The inode checksum is important enough to >> warrant a field in the core inode, so that it also works on upgraded >> filesystems. > > Hmm, yes, I better like using that field too, it'll simplify the code. Does > anyone know which of the s_creator_os codes map to the Linux format? I'm going > to guess 1 and 2 don't... for now I suppose the mount code can check for a > Linux creator code if the inode checksum feature is set, and fail the mount if > it finds 1 or 2. I'm not sure which version of the osd2 union FreeBSD or > "Lites" use. Actually, the e2fsprogs code has already removed the "masix" part of the union years ago, and I'm not even sure why the "hurd" part of the union remains in the kernel code. I think only Hurd is using the "h_i_author" field, so I suspect all that is needed is to refuse tune2fs to enable EXT4_FEATURE_RO_COMPAT_INODE_CSUM if s_creator_os == EXT2_OS_HURD (1). The kernel shouldn't really care about this field - if the checksum feature isn't enabled, then this field will be ignored, and if the checksum feature IS enabled it could only have been done by mke2fs, or tune2fs which should check if it is allowed based on s_creator_os. >>> +static __le16 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw) >>> +{ >>> + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); >>> + struct ext4_inode_info *ei = EXT4_I(inode); >>> + int offset = offsetof(struct ext4_inode, i_checksum); >>> + __le32 inum = cpu_to_le32(inode->i_ino); >>> + __u16 crc = 0; >>> + >>> + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, >>> + EXT4_FEATURE_RO_COMPAT_INODE_CSUM) && >>> + le16_to_cpu(raw->i_extra_isize) >= 4) { >> >> If this field remains in the large part of the inode, then this check is >> incorrect. i_extra_isize is itself only valid if (s_inode_size is > >> EXT4_GOOD_OLD_INODE_SIZE), otherwise it points at the next inode's i_mode. >> >> Also, instead of hard-coding ">= 4" here, it would be better to use >> EXT4_FITS_IN_INODE(raw, EXT4_I(inode), i_checksum). Probably no reason to >> put "ei" on the stack at all. > > As always, thank you for the feedback! > > --D Cheers, Andreas -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4daaf2b..8815928 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -617,7 +617,7 @@ struct ext4_inode { } masix2; } osd2; /* OS dependent 2 */ __le16 i_extra_isize; - __le16 i_pad1; + __le16 i_checksum; /* crc16(sb_uuid+inodenum+inode) */ __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ @@ -1338,6 +1338,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 +#define EXT4_FEATURE_RO_COMPAT_INODE_CSUM 0x0400 #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 @@ -1364,7 +1365,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ - EXT4_FEATURE_RO_COMPAT_HUGE_FILE) + EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ + EXT4_FEATURE_RO_COMPAT_INODE_CSUM) /* * Default values for user and/or group using reserved blocks diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1a86282..dc76f19 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -42,6 +42,7 @@ #include <linux/printk.h> #include <linux/slab.h> #include <linux/ratelimit.h> +#include <linux/crc16.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -52,6 +53,40 @@ #define MPAGE_DA_EXTENT_TAIL 0x01 +static __le16 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + int offset = offsetof(struct ext4_inode, i_checksum); + __le32 inum = cpu_to_le32(inode->i_ino); + __u16 crc = 0; + + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_INODE_CSUM) && + le16_to_cpu(raw->i_extra_isize) >= 4) { + crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); + crc = crc16(crc, (__u8 *)&inum, sizeof(inum)); + crc = crc16(crc, (__u8 *)raw, offset); + offset += sizeof(raw->i_checksum); /* skip checksum */ + /* for checksum of struct ext4_inode do the rest...*/ + if (ei->i_extra_isize > 4) + crc = crc16(crc, (__u8 *)raw + offset, + ei->i_extra_isize - 4); + } + + return cpu_to_le16(crc); +} + +static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw) +{ + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_INODE_CSUM) && + (raw->i_checksum != ext4_inode_csum(inode, raw))) + return 0; + + return 1; +} + static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { @@ -4802,7 +4837,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) struct ext4_inode *raw_inode; struct ext4_inode_info *ei; struct inode *inode; - journal_t *journal = EXT4_SB(sb)->s_journal; + struct ext4_sb_info *sbi = EXT4_SB(sb); + journal_t *journal = sbi->s_journal; long ret; int block; @@ -4916,6 +4952,14 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) } else ei->i_extra_isize = 0; + if (!ext4_inode_csum_verify(inode, raw_inode)) { + EXT4_ERROR_INODE(inode, "checksum invalid (%u != %u)", + le16_to_cpu(ext4_inode_csum(inode, raw_inode)), + le16_to_cpu(raw_inode->i_checksum)); + ret = -EIO; + goto bad_inode; + } + EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); @@ -5138,6 +5182,12 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_version_hi = cpu_to_le32(inode->i_version >> 32); raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); + + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_INODE_CSUM) && + EXT4_FITS_IN_INODE(raw_inode, ei, i_checksum)) + raw_inode->i_checksum = + ext4_inode_csum(inode, raw_inode); } BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
This patch introduces to ext4 the ability to calculate and verify inode checksums. This requires the use of a new ro compatibility flag and some accompanying e2fsprogs patches to provide the relevant features in tune2fs and e2fsck. Signed-off-by: Darrick J. Wong <djwong@us.ibm.com> --- fs/ext4/ext4.h | 6 ++++-- fs/ext4/inode.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 55 insertions(+), 3 deletions(-) -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html