Message ID | 1316127052-1890-2-git-send-email-tytso@mit.edu |
---|---|
State | Accepted, archived |
Headers | show |
On 2011-09-15, at 4:50 PM, Theodore Ts'o wrote: > Reserve EXT4_FEATURE_RO_COMPAT_METADATA_CSUM and > EXT2_FEATURE_COMPAT_EXCLUDE_BITMAP. Also reserve fields in the > superblock and the inode for the checksums. In the block group > descriptor, reserve the exclude bitmap field for the snapshot feature, > and checksums for the inode and block allocation bitmaps. > > With this commit, the metadata checksum and exclude bitmap features > should have reserved all of the fields they need in ext4's on-disk > format. > > #define EXT2_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not initialized */ > @@ -363,7 +370,8 @@ struct ext2_inode { > __u16 l_i_file_acl_high; > __u16 l_i_uid_high; /* these 2 fields */ > __u16 l_i_gid_high; /* were reserved2[0] */ > - __u32 l_i_reserved2; > + __u16 l_i_checksum_lo;/* crc32c(uuid+inum+inode) */ > + __u16 l_i_reserved; /* crc32c(uuid+inum+inode) */ > } linux2; The comment for l_i_reserved is incorrect, and the comment should include "LSB"? Also, if the order of these two fields was swapped it would avoid an extra call to the CRC function to checksum the last 2 bytes: __u16 l_i_uid_high; /* these 2 fields */ __u16 l_i_gid_high; /* were reserved2[0] */ - __u32 l_i_reserved2; + __u16 l_i_reserved; + __u16 l_i_checksum_lo;/* crc32c(uuid+inum+inode) LSB*/ } linux2; > @@ -422,7 +431,7 @@ struct ext2_inode_large { > } hurd2; > } osd2; /* OS dependent 2 */ > __u16 i_extra_isize; > - __u16 i_pad1; > + __u16 i_checksum_hi; /* crc32c(uuid+inum+inode) */ This comment should include "MSB". > @@ -623,7 +632,8 @@ struct ext2_super_block { > __u32 s_usr_quota_inum; /* inode number of user quota file */ > __u32 s_grp_quota_inum; /* inode number of group quota file */ > __u32 s_overhead_blocks; /* overhead blocks/clusters in fs */ > - __u32 s_reserved[109]; /* Padding to the end of the block */ > + __u32 s_checksum; /* crc32c(superblock) */ > + __u32 s_reserved[108]; /* Padding to the end of the block */ > }; I thought it would be better to move s_checksum to be the last field in the superblock to avoid multiple calls to the CRC function? So: __u32 s_grp_quota_inum; /* inode number of group quota file */ __u32 s_overhead_blocks; /* overhead blocks/clusters in fs */ - __u32 s_reserved[109]; /* Padding to the end of the block */ + __u32 s_reserved[108]; /* Padding to the end of the block */ + __u32 s_checksum; /* crc32c(superblock) */ }; > diff --git a/lib/ext2fs/tst_inode_size.c b/lib/ext2fs/tst_inode_size.c > index 962f1cd..a4f6247 100644 > --- a/lib/ext2fs/tst_inode_size.c > +++ b/lib/ext2fs/tst_inode_size.c > @@ -61,7 +61,8 @@ void check_structure_fields() > check_field(osd2.linux2.l_i_file_acl_high); > check_field(osd2.linux2.l_i_uid_high); > check_field(osd2.linux2.l_i_gid_high); > - check_field(osd2.linux2.l_i_reserved2); > + check_field(osd2.linux2.l_i_checksum_lo); > + check_field(osd2.linux2.l_i_reserved); > printf("Ending offset is %d\n\n", cur_offset); > #endif > } > diff --git a/lib/ext2fs/tst_super_size.c b/lib/ext2fs/tst_super_size.c > index 1e5a524..75659ae 100644 > --- a/lib/ext2fs/tst_super_size.c > +++ b/lib/ext2fs/tst_super_size.c > @@ -126,6 +126,7 @@ void check_superblock_fields() > check_field(s_usr_quota_inum); > check_field(s_grp_quota_inum); > check_field(s_overhead_blocks); > + check_field(s_checksum); > check_field(s_reserved); > printf("Ending offset is %d\n\n", cur_offset); > #endif These two checks would need to be reversed to match the above changes. Cheers, Andreas -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Sep 15, 2011 at 05:09:13PM -0600, Andreas Dilger wrote: > > I thought it would be better to move s_checksum to be the last field in the > superblock to avoid multiple calls to the CRC function? Did you see my comment about just zero'ing the checksum field before running the CRC? We're going to have to do that for other data structures, such as the inode structure, and it's what we do with the block group descriptor checksum. Might as well do that everywhere for consistency's sake. - Ted -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2011-09-15, at 5:11 PM, Ted Ts'o wrote: > On Thu, Sep 15, 2011 at 05:09:13PM -0600, Andreas Dilger wrote: >> >> I thought it would be better to move s_checksum to be the last field in the >> superblock to avoid multiple calls to the CRC function? > > Did you see my comment about just zero'ing the checksum field before > running the CRC? We're going to have to do that for other data > structures, such as the inode structure, and it's what we do with the > block group descriptor checksum. That isn't correct. The group descriptor checksum is computed in chunks: __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, struct ext4_group_desc *gdp) { int offset = offsetof(struct ext4_group_desc, bg_checksum); __le32 le_group = cpu_to_le32(block_group); crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group)); crc = crc16(crc, (__u8 *)gdp, offset); offset += sizeof(gdp->bg_checksum); /* skip checksum */ ****HERE**** /* for checksum of struct ext4_group_desc do the rest...*/ if ((sbi->s_es->s_feature_incompat & cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) && offset < le16_to_cpu(sbi->s_es->s_desc_size)) crc = crc16(crc, (__u8 *)gdp + offset, le16_to_cpu(sbi->s_es->s_desc_size) - offset); } Darrick and I discussed zeroing the checksum fields, but then there is a race with other threads accessing the same structure. If we went to a crc32c LSB for filesystems with RO_COMPAT_CSUM it would be possible to change how it is computed. Since we have freedom to move the checksum field now, why have the added complexity to do zeroing of the field or two chunks? Since we naturally have to break the checksum calculation for 128-byte inodes and 32-byte descriptors, due to old versions of those structs, there is little overhead in just skipping the field, and no races. Cheers, Andreas -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Sep 15, 2011 at 05:34:41PM -0600, Andreas Dilger wrote: > > Darrick and I discussed zeroing the checksum fields, but then there is a > race with other threads accessing the same structure. What race are you worried about? The moment you modify some part of the data structure, the checksum is going to be wrong. This is true whether you zero out the checksum field before you do the calculations or not. > If we went to a crc32c LSB for filesystems with RO_COMPAT_CSUM it would > be possible to change how it is computed. Since we have freedom to move > the checksum field now, why have the added complexity to do zeroing of > the field or two chunks? Why is zero'ing out the field complex? It's a single line of code.... It's certainly easier than doing it in two chunks, and there will be some data structures (the block group descriptors at the very least) where zero'ing the checksum is definitely going to be the easier way to go. - Ted -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2011-09-15, at 5:41 PM, Ted Ts'o wrote: > On Thu, Sep 15, 2011 at 05:34:41PM -0600, Andreas Dilger wrote: >> >> Darrick and I discussed zeroing the checksum fields, but then there is a >> race with other threads accessing the same structure. > > What race are you worried about? The moment you modify some part of > the data structure, the checksum is going to be wrong. This is true > whether you zero out the checksum field before you do the calculations > or not. If you are reading the structure (not modifying it) and trying to verify the checksum, it would be necessary to read-lock the structure, zero the field, compute the checksum, reset the field, unlock, and then compare checksums. Alternately, one would have to make a copy of the struct to zero out the field and compute the checksum on the copy. Both are more complex than just doing the checksum on two separate chunks. >> If we went to a crc32c LSB for filesystems with RO_COMPAT_CSUM it would >> be possible to change how it is computed. Since we have freedom to move >> the checksum field now, why have the added complexity to do zeroing of >> the field or two chunks? > > Why is zero'ing out the field complex? It's a single line of code.... > It's certainly easier than doing it in two chunks, and there will be > some data structures (the block group descriptors at the very least) > where zero'ing the checksum is definitely going to be the easier way > to go. No, because for group descriptors, the size is conditional on whether RO_COMPAT_GDT_CSUM is set, so it is just as easy to always compute the first 32-byte crc32 (excluding the bg_checksum field) and then only conditionally compute the second 32-byte checksum. The same is true of the inode checksum and s_inode_size, if the checksum is at the last field of struct ext2_inode. Cheers, Andreas -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Sep 15, 2011 at 06:09:04PM -0600, Andreas Dilger wrote: > On 2011-09-15, at 5:41 PM, Ted Ts'o wrote: > > On Thu, Sep 15, 2011 at 05:34:41PM -0600, Andreas Dilger wrote: > >> > >> Darrick and I discussed zeroing the checksum fields, but then there is a > >> race with other threads accessing the same structure. > > > > What race are you worried about? The moment you modify some part of > > the data structure, the checksum is going to be wrong. This is true > > whether you zero out the checksum field before you do the calculations > > or not. > > If you are reading the structure (not modifying it) and trying to verify > the checksum, it would be necessary to read-lock the structure, zero > the field, compute the checksum, reset the field, unlock, and then > compare checksums. Alternately, one would have to make a copy of the > struct to zero out the field and compute the checksum on the copy. Both > are more complex than just doing the checksum on two separate chunks. I think a lot of the metadata update code already has locking to prevent multiple writers. I'm not as sure about the locking around the read calls, though I suspect that we take some of the same locks before starting the read. However, I've yet to audit all those code paths. Point is, we might already be doing most of the necessary locking. Too bad I'm not sure. > > >> If we went to a crc32c LSB for filesystems with RO_COMPAT_CSUM it would > >> be possible to change how it is computed. Since we have freedom to move > >> the checksum field now, why have the added complexity to do zeroing of > >> the field or two chunks? > > > > Why is zero'ing out the field complex? It's a single line of code.... > > It's certainly easier than doing it in two chunks, and there will be > > some data structures (the block group descriptors at the very least) > > where zero'ing the checksum is definitely going to be the easier way > > to go. > > No, because for group descriptors, the size is conditional on whether > RO_COMPAT_GDT_CSUM is set, so it is just as easy to always compute the > first 32-byte crc32 (excluding the bg_checksum field) and then only > conditionally compute the second 32-byte checksum. The same is true > of the inode checksum and s_inode_size, if the checksum is at the last > field of struct ext2_inode. Um.... it looks to me like the size is conditional on EXT4_FEATURE_INCOMPAT_64BIT being set and then sb.s_desc_size. Are you suggesting that we make the group descriptor checksum a full 32 bits and just stuff the upper 16 bits somewhere near the end of the structure? I suppose that would leave us with very little space in the group descriptor. On the other hand the 32-bit format is totally full already and any size increase depends on 64bit, which means we can make it even bigger with little pain if we need to. Incidentally, I have cached versions of the ext4 disk layout and metadata checksumming wiki pages mirrored here: http://djwong.org/docs/ext4_metadata_checksums.html http://djwong.org/docs/ext4_disk_layout.pdf --D -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Sep 15, 2011 at 06:09:04PM -0600, Andreas Dilger wrote: > > If you are reading the structure (not modifying it) and trying to verify > the checksum, it would be necessary to read-lock the structure, zero > the field, compute the checksum, reset the field, unlock, and then > compare checksums. Alternately, one would have to make a copy of the > struct to zero out the field and compute the checksum on the copy. Both > are more complex than just doing the checksum on two separate chunks. You have to read-lock the structure before calculating the checksum in any case, since otherwise when someone else is modifying the structure, before they have a chance to update the checksum, you'll calculate the checksum and discover that it is incorrect. In practice we would probably be calculating the checksum when the inode if first read into memory, before it it is visible to the rest of the system, so this shouldn't be an issue. But if it is visible to the rest of the system, even you put the checksum at the end, if someone else can modify the data structure while you are calculating it, the checksum will be wrong. > No, because for group descriptors, the size is conditional on whether > RO_COMPAT_GDT_CSUM is set, so it is just as easy to always compute the > first 32-byte crc32 (excluding the bg_checksum field) and then only > conditionally compute the second 32-byte checksum. The same is true > of the inode checksum and s_inode_size, if the checksum is at the last > field of struct ext2_inode. But wouldn't it be faster to zero out the two fields, and then caluclate a single 64-byte checksum? That way we avoid the setup costs of the crc32, especially in the case of the crc32c-sby8-[lb]e implementation. - Ted -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2011-09-15, at 7:06 PM, Ted Ts'o wrote: > On Thu, Sep 15, 2011 at 06:09:04PM -0600, Andreas Dilger wrote: >> >> If you are reading the structure (not modifying it) and trying to verify >> the checksum, it would be necessary to read-lock the structure, zero >> the field, compute the checksum, reset the field, unlock, and then >> compare checksums. Alternately, one would have to make a copy of the >> struct to zero out the field and compute the checksum on the copy. Both >> are more complex than just doing the checksum on two separate chunks. > > You have to read-lock the structure before calculating the checksum in > any case, since otherwise when someone else is modifying the > structure, before they have a chance to update the checksum, you'll > calculate the checksum and discover that it is incorrect. Sure, but if the structure is read-locked it shouldn't be modified... > In practice we would probably be calculating the checksum when the > inode if first read into memory, before it it is visible to the rest > of the system, so this shouldn't be an issue. But if it is visible to > the rest of the system, even you put the checksum at the end, if > someone else can modify the data structure while you are calculating > it, the checksum will be wrong. True, but at the same time is there a reason _not_ to put the checksum at the end? For the superblock in particular it seems easy to do and simplifies the code either way. >> No, because for group descriptors, the size is conditional on whether >> RO_COMPAT_GDT_CSUM is set, so it is just as easy to always compute the >> first 32-byte crc32 (excluding the bg_checksum field) and then only >> conditionally compute the second 32-byte checksum. The same is true >> of the inode checksum and s_inode_size, if the checksum is at the last >> field of struct ext2_inode. > > But wouldn't it be faster to zero out the two fields, and then > caluclate a single 64-byte checksum? That way we avoid the setup > costs of the crc32, especially in the case of the crc32c-sby8-[lb]e > implementation. Looking at Darrick's performance tests for the checksums, at 32 bytes the performance is 75%/95% (crc32-sby8-le/crc32-intel) of peak, and 91%/97% of peak for 128 bytes, so the overhead of computing the checksum in two parts is probably not significant. Cheers, Andreas -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2011-09-15, at 6:56 PM, Darrick J. Wong wrote: > On Thu, Sep 15, 2011 at 06:09:04PM -0600, Andreas Dilger wrote: >> If you are reading the structure (not modifying it) and trying to verify >> the checksum, it would be necessary to read-lock the structure, zero >> the field, compute the checksum, reset the field, unlock, and then >> compare checksums. Alternately, one would have to make a copy of the >> struct to zero out the field and compute the checksum on the copy. Both >> are more complex than just doing the checksum on two separate chunks. > > I think a lot of the metadata update code already has locking to prevent > multiple writers. I'm not as sure about the locking around the read calls, > though I suspect that we take some of the same locks before starting the read. > However, I've yet to audit all those code paths. Point is, we might already be > doing most of the necessary locking. Too bad I'm not sure. > >>>> If we went to a crc32c LSB for filesystems with RO_COMPAT_METADATA_CSUM >>>> it would be possible to change how it is computed. Since we have freedom >>>> to move the checksum field now, why have the added complexity to do >>>> zeroing of the field or two chunks? >>> >>> Why is zero'ing out the field complex? It's a single line of code.... >>> It's certainly easier than doing it in two chunks, and there will be >>> some data structures (the block group descriptors at the very least) >>> where zero'ing the checksum is definitely going to be the easier way >>> to go. >> >> No, because for group descriptors, the size is conditional on whether >> RO_COMPAT_GDT_CSUM is set, so it is just as easy to always compute the >> first 32-byte crc32 (excluding the bg_checksum field) and then only >> conditionally compute the second 32-byte checksum. The same is true >> of the inode checksum and s_inode_size, if the checksum is at the last >> field of struct ext2_inode. > > Um.... it looks to me like the size is conditional on > EXT4_FEATURE_INCOMPAT_64BIT being set and then sb.s_desc_size. My bad, I was looking at ext4_group_desc_csum() for the flag name, and my eyes caught the wrong one... > Are you > suggesting that we make the group descriptor checksum a full 32 bits and just > stuff the upper 16 bits somewhere near the end of the structure? No, I don't think we need a 32-bit checksum for the group descriptor. Rather that the crc32c is so much faster than crc16 that it might make sense to use the low 16 bits of crc32c for the group descriptor if RO_COMPAT_METADATA_CSUM is set. > Incidentally, I have cached versions of the ext4 disk layout and metadata > checksumming wiki pages mirrored here: > http://djwong.org/docs/ext4_metadata_checksums.html > http://djwong.org/docs/ext4_disk_layout.pdf Cheers, Andreas -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Sep 15, 2011 at 09:57:33PM -0600, Andreas Dilger wrote: > True, but at the same time is there a reason _not_ to put the checksum > at the end? For the superblock in particular it seems easy to do and > simplifies the code either way. For the superblock, OK. I'll buy that and I'll make the change so that tail end of the superblock looks like this: __u32 s_overhead_blocks; /* overhead blocks/clusters in fs */ __u32 s_reserved[108]; /* Padding to the end of the block */ __u32 s_checksum; /* crc32c(superblock) */ }; - Ted -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Sep 15, 2011 at 06:50:51PM -0400, Theodore Ts'o wrote: <snip> > diff --git a/lib/ext2fs/ext2_fs.h b/lib/ext2fs/ext2_fs.h > index 4fec5db..1c86cb2 100644 > --- a/lib/ext2fs/ext2_fs.h > +++ b/lib/ext2fs/ext2_fs.h > @@ -363,7 +370,8 @@ struct ext2_inode { > __u16 l_i_file_acl_high; > __u16 l_i_uid_high; /* these 2 fields */ > __u16 l_i_gid_high; /* were reserved2[0] */ > - __u32 l_i_reserved2; > + __u16 l_i_checksum_lo; /* crc32c(uuid+inum+inode) */ > + __u16 l_i_reserved; /* crc32c(uuid+inum+inode) */ > } linux2; > struct { > __u8 h_i_frag; /* Fragment number */ > @@ -410,7 +418,8 @@ struct ext2_inode_large { > __u16 l_i_file_acl_high; > __u16 l_i_uid_high; /* these 2 fields */ > __u16 l_i_gid_high; /* were reserved2[0] */ > - __u32 l_i_reserved2; > + __u16 l_i_checksum_lo; /* crc32c(uuid+inum+inode) */ > + __u16 l_i_reserved; > } linux2; > struct { > __u8 h_i_frag; /* Fragment number */ > @@ -422,7 +431,7 @@ struct ext2_inode_large { > } hurd2; > } osd2; /* OS dependent 2 */ > __u16 i_extra_isize; > - __u16 i_pad1; > + __u16 i_checksum_hi; /* crc32c(uuid+inum+inode) */ > __u32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ > __u32 i_mtime_extra; /* extra Modification time (nsec << 2 | epoch) */ > __u32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ > @@ -441,7 +450,7 @@ struct ext2_inode_large { > #define i_gid_low i_gid > #define i_uid_high osd2.linux2.l_i_uid_high > #define i_gid_high osd2.linux2.l_i_gid_high > -#define i_reserved2 osd2.linux2.l_i_reserved2 > +#define i_checksum osd2.linux2.l_i_checksum s/i_checksum/i_checksum_lo/, I think. --D -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Sep 15, 2011 at 06:50:51PM -0400, Theodore Ts'o wrote: > Reserve EXT4_FEATURE_RO_COMPAT_METADATA_CSUM and > EXT2_FEATURE_COMPAT_EXCLUDE_BITMAP. Also reserve fields in the > superblock and the inode for the checksums. In the block group > descriptor, reserve the exclude bitmap field for the snapshot feature, > and checksums for the inode and block allocation bitmaps. > > With this commit, the metadata checksum and exclude bitmap features > should have reserved all of the fields they need in ext4's on-disk > format. > > This commit also fixes an a missing byte swap for s_overhead_blocks. > > Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> > Cc: Darrick J. Wong <djwong@us.ibm.com> > Cc: Amir Goldstein <amir73il@gmail.com> > --- > debugfs/set_fields.c | 3 ++- > e2fsck/pass1.c | 4 ++-- > lib/e2p/feature.c | 4 ++++ > lib/e2p/ls.c | 4 ++++ > lib/ext2fs/ext2_fs.h | 31 ++++++++++++++++++++++--------- > lib/ext2fs/swapfs.c | 16 ++++++++++++++-- > lib/ext2fs/tst_inode_size.c | 3 ++- > lib/ext2fs/tst_super_size.c | 1 + > 8 files changed, 51 insertions(+), 15 deletions(-) > > diff --git a/debugfs/set_fields.c b/debugfs/set_fields.c > index ac6bc25..1a5ec7f 100644 > --- a/debugfs/set_fields.c > +++ b/debugfs/set_fields.c > @@ -144,6 +144,7 @@ static struct field_set_info super_fields[] = { > { "usr_quota_inum", &set_sb.s_usr_quota_inum, 4, parse_uint }, > { "grp_quota_inum", &set_sb.s_grp_quota_inum, 4, parse_uint }, > { "overhead_blocks", &set_sb.s_overhead_blocks, 4, parse_uint }, > + { "checksum", &set_sb.s_checksum, 4, parse_uint }, > { 0, 0, 0, 0 } > }; > > @@ -179,6 +180,7 @@ static struct field_set_info inode_fields[] = { > { "fsize", &set_inode.osd2.hurd2.h_i_fsize, 1, parse_uint }, > { "uid_high", &set_inode.osd2.linux2.l_i_uid_high, 2, parse_uint }, > { "gid_high", &set_inode.osd2.linux2.l_i_gid_high, 2, parse_uint }, > + { "checksum", &set_inode.osd2.linux2.l_i_checksum_lo, 2, parse_uint }, > { "author", &set_inode.osd2.hurd2.h_i_author, 4, parse_uint }, > { "bmap", NULL, 4, parse_bmap, FLAG_ARRAY }, > { 0, 0, 0, 0 } > @@ -192,7 +194,6 @@ static struct field_set_info ext2_bg_fields[] = { > { "free_inodes_count", &set_gd.bg_free_inodes_count, 2, parse_uint }, > { "used_dirs_count", &set_gd.bg_used_dirs_count, 2, parse_uint }, > { "flags", &set_gd.bg_flags, 2, parse_uint }, > - { "reserved", &set_gd.bg_reserved, 2, parse_uint, FLAG_ARRAY, 2 }, > { "itable_unused", &set_gd.bg_itable_unused, 2, parse_uint }, > { "checksum", &set_gd.bg_checksum, 2, parse_gd_csum }, > { 0, 0, 0, 0 } > diff --git a/e2fsck/pass1.c b/e2fsck/pass1.c > index dd18ade..16ddcda 100644 > --- a/e2fsck/pass1.c > +++ b/e2fsck/pass1.c > @@ -364,8 +364,8 @@ static void check_inode_extra_space(e2fsck_t ctx, struct problem_context *pctx) > printf("inode #%u, i_extra_size %d\n", pctx->ino, > inode->i_extra_isize); > #endif > - /* i_extra_isize must cover i_extra_isize + i_pad1 at least */ > - min = sizeof(inode->i_extra_isize) + sizeof(inode->i_pad1); > + /* i_extra_isize must cover i_extra_isize + i_checksum_hi at least */ > + min = sizeof(inode->i_extra_isize) + sizeof(inode->i_checksum_hi); > max = EXT2_INODE_SIZE(sb) - EXT2_GOOD_OLD_INODE_SIZE; > /* > * For now we will allow i_extra_isize to be 0, but really > diff --git a/lib/e2p/feature.c b/lib/e2p/feature.c > index 16fba53..965fc16 100644 > --- a/lib/e2p/feature.c > +++ b/lib/e2p/feature.c > @@ -40,6 +40,8 @@ static struct feature feature_list[] = { > "resize_inode" }, > { E2P_FEATURE_COMPAT, EXT2_FEATURE_COMPAT_LAZY_BG, > "lazy_bg" }, > + { E2P_FEATURE_COMPAT, EXT2_FEATURE_COMPAT_EXCLUDE_BITMAP, > + "snapshot" }, > > { E2P_FEATURE_RO_INCOMPAT, EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER, > "sparse_super" }, > @@ -59,6 +61,8 @@ static struct feature feature_list[] = { > "quota" }, > { E2P_FEATURE_RO_INCOMPAT, EXT4_FEATURE_RO_COMPAT_BIGALLOC, > "bigalloc"}, > + { E2P_FEATURE_RO_INCOMPAT, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM, > + "metadata_csum"}, > > { E2P_FEATURE_INCOMPAT, EXT2_FEATURE_INCOMPAT_COMPRESSION, > "compression" }, > diff --git a/lib/e2p/ls.c b/lib/e2p/ls.c > index 0f36f40..aaacdaa 100644 > --- a/lib/e2p/ls.c > +++ b/lib/e2p/ls.c > @@ -413,6 +413,10 @@ void list_super2(struct ext2_super_block * sb, FILE *f) > if (sb->s_grp_quota_inum) > fprintf(f, "Group quota inode: %u\n", > sb->s_grp_quota_inum); > + > + if (sb->s_feature_ro_compat & EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) > + fprintf(f, "Checksum: 0x%08x\n", > + sb->s_checksum); > } > > void list_super (struct ext2_super_block * s) > diff --git a/lib/ext2fs/ext2_fs.h b/lib/ext2fs/ext2_fs.h > index 4fec5db..1c86cb2 100644 > --- a/lib/ext2fs/ext2_fs.h > +++ b/lib/ext2fs/ext2_fs.h > @@ -142,7 +142,9 @@ struct ext2_group_desc > __u16 bg_free_inodes_count; /* Free inodes count */ > __u16 bg_used_dirs_count; /* Directories count */ > __u16 bg_flags; > - __u32 bg_reserved[2]; > + __u32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ > + __u16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bitmap) LSB */ > + __u16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bitmap) LSB */ > __u16 bg_itable_unused; /* Unused inodes count */ > __u16 bg_checksum; /* crc16(s_uuid+grouo_num+group_desc)*/ > }; > @@ -159,7 +161,9 @@ struct ext4_group_desc > __u16 bg_free_inodes_count; /* Free inodes count */ > __u16 bg_used_dirs_count; /* Directories count */ > __u16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ > - __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */ > + __u32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ > + __u16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bitmap) LSB */ > + __u16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bitmap) LSB */ > __u16 bg_itable_unused; /* Unused inodes count */ > __u16 bg_checksum; /* crc16(sb_uuid+group+desc) */ > __u32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ > @@ -169,7 +173,10 @@ struct ext4_group_desc > __u16 bg_free_inodes_count_hi;/* Free inodes count MSB */ > __u16 bg_used_dirs_count_hi; /* Directories count MSB */ > __u16 bg_itable_unused_hi; /* Unused inodes count MSB */ > - __u32 bg_reserved2[3]; > + __u32 bg_exclude_bitmap_hi; /* Exclude bitmap block MSB */ > + __u16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bitmap) MSB */ > + __u16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bitmap) MSB */ > + __u32 bg_reserved; Should we attach a checksum to the exclude bitmap? That would, unfortunately, put the 64-byte block group pretty close to full, unless we're ok with making the bg even bigger... --D > }; > > #define EXT2_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not initialized */ > @@ -363,7 +370,8 @@ struct ext2_inode { > __u16 l_i_file_acl_high; > __u16 l_i_uid_high; /* these 2 fields */ > __u16 l_i_gid_high; /* were reserved2[0] */ > - __u32 l_i_reserved2; > + __u16 l_i_checksum_lo; /* crc32c(uuid+inum+inode) */ > + __u16 l_i_reserved; /* crc32c(uuid+inum+inode) */ > } linux2; > struct { > __u8 h_i_frag; /* Fragment number */ > @@ -410,7 +418,8 @@ struct ext2_inode_large { > __u16 l_i_file_acl_high; > __u16 l_i_uid_high; /* these 2 fields */ > __u16 l_i_gid_high; /* were reserved2[0] */ > - __u32 l_i_reserved2; > + __u16 l_i_checksum_lo; /* crc32c(uuid+inum+inode) */ > + __u16 l_i_reserved; > } linux2; > struct { > __u8 h_i_frag; /* Fragment number */ > @@ -422,7 +431,7 @@ struct ext2_inode_large { > } hurd2; > } osd2; /* OS dependent 2 */ > __u16 i_extra_isize; > - __u16 i_pad1; > + __u16 i_checksum_hi; /* crc32c(uuid+inum+inode) */ > __u32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ > __u32 i_mtime_extra; /* extra Modification time (nsec << 2 | epoch) */ > __u32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ > @@ -441,7 +450,7 @@ struct ext2_inode_large { > #define i_gid_low i_gid > #define i_uid_high osd2.linux2.l_i_uid_high > #define i_gid_high osd2.linux2.l_i_gid_high > -#define i_reserved2 osd2.linux2.l_i_reserved2 > +#define i_checksum osd2.linux2.l_i_checksum > #else > #if defined(__GNU__) > > @@ -623,7 +632,8 @@ struct ext2_super_block { > __u32 s_usr_quota_inum; /* inode number of user quota file */ > __u32 s_grp_quota_inum; /* inode number of group quota file */ > __u32 s_overhead_blocks; /* overhead blocks/clusters in fs */ > - __u32 s_reserved[109]; /* Padding to the end of the block */ > + __u32 s_checksum; /* crc32c(superblock) */ > + __u32 s_reserved[108]; /* Padding to the end of the block */ > }; > > #define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) > @@ -671,7 +681,9 @@ struct ext2_super_block { > #define EXT2_FEATURE_COMPAT_RESIZE_INODE 0x0010 > #define EXT2_FEATURE_COMPAT_DIR_INDEX 0x0020 > #define EXT2_FEATURE_COMPAT_LAZY_BG 0x0040 > -#define EXT2_FEATURE_COMPAT_EXCLUDE_INODE 0x0080 > +/* #define EXT2_FEATURE_COMPAT_EXCLUDE_INODE 0x0080 not used, legacy */ > +#define EXT2_FEATURE_COMPAT_EXCLUDE_BITMAP 0x0100 > + > > #define EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 > #define EXT2_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 > @@ -683,6 +695,7 @@ struct ext2_super_block { > #define EXT4_FEATURE_RO_COMPAT_HAS_SNAPSHOT 0x0080 > #define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 > #define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 > +#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 > > #define EXT2_FEATURE_INCOMPAT_COMPRESSION 0x0001 > #define EXT2_FEATURE_INCOMPAT_FILETYPE 0x0002 > diff --git a/lib/ext2fs/swapfs.c b/lib/ext2fs/swapfs.c > index 87b1a2e..d1c4a56 100644 > --- a/lib/ext2fs/swapfs.c > +++ b/lib/ext2fs/swapfs.c > @@ -78,6 +78,8 @@ void ext2fs_swap_super(struct ext2_super_block * sb) > sb->s_snapshot_list = ext2fs_swab32(sb->s_snapshot_list); > sb->s_usr_quota_inum = ext2fs_swab32(sb->s_usr_quota_inum); > sb->s_grp_quota_inum = ext2fs_swab32(sb->s_grp_quota_inum); > + sb->s_overhead_blocks = ext2fs_swab32(sb->s_overhead_blocks); > + sb->s_checksum = ext2fs_swab32(sb->s_checksum); > > for (i=0; i < 4; i++) > sb->s_hash_seed[i] = ext2fs_swab32(sb->s_hash_seed[i]); > @@ -106,6 +108,11 @@ void ext2fs_swap_group_desc2(ext2_filsys fs, struct ext2_group_desc *gdp) > gdp->bg_free_inodes_count = ext2fs_swab16(gdp->bg_free_inodes_count); > gdp->bg_used_dirs_count = ext2fs_swab16(gdp->bg_used_dirs_count); > gdp->bg_flags = ext2fs_swab16(gdp->bg_flags); > + gdp->bg_exclude_bitmap_lo = ext2fs_swab32(gdp->bg_exclude_bitmap_lo); > + gdp->bg_block_bitmap_csum_lo = > + ext2fs_swab16(gdp->bg_block_bitmap_csum_lo); > + gdp->bg_inode_bitmap_csum_lo = > + ext2fs_swab16(gdp->bg_inode_bitmap_csum_lo); > gdp->bg_itable_unused = ext2fs_swab16(gdp->bg_itable_unused); > gdp->bg_checksum = ext2fs_swab16(gdp->bg_checksum); > /* If we're 32-bit, we're done */ > @@ -125,6 +132,11 @@ void ext2fs_swap_group_desc2(ext2_filsys fs, struct ext2_group_desc *gdp) > gdp4->bg_used_dirs_count_hi = > ext2fs_swab16(gdp4->bg_used_dirs_count_hi); > gdp4->bg_itable_unused_hi = ext2fs_swab16(gdp4->bg_itable_unused_hi); > + gdp->bg_exclude_bitmap_hi = ext2fs_swab16(gdp->bg_exclude_bitmap_hi); > + gdp->bg_block_bitmap_csum_hi = > + ext2fs_swab16(gdp->bg_block_bitmap_csum_hi); > + gdp->bg_inode_bitmap_csum_hi = > + ext2fs_swab16(gdp->bg_inode_bitmap_csum_hi); > } > > void ext2fs_swap_group_desc(struct ext2_group_desc *gdp) > @@ -244,8 +256,8 @@ void ext2fs_swap_inode_full(ext2_filsys fs, struct ext2_inode_large *t, > ext2fs_swab16 (f->osd2.linux2.l_i_uid_high); > t->osd2.linux2.l_i_gid_high = > ext2fs_swab16 (f->osd2.linux2.l_i_gid_high); > - t->osd2.linux2.l_i_reserved2 = > - ext2fs_swab32(f->osd2.linux2.l_i_reserved2); > + t->osd2.linux2.l_i_checksum = > + ext2fs_swab32(f->osd2.linux2.checksum); > break; > case EXT2_OS_HURD: > t->osd1.hurd1.h_i_translator = > diff --git a/lib/ext2fs/tst_inode_size.c b/lib/ext2fs/tst_inode_size.c > index 962f1cd..a4f6247 100644 > --- a/lib/ext2fs/tst_inode_size.c > +++ b/lib/ext2fs/tst_inode_size.c > @@ -61,7 +61,8 @@ void check_structure_fields() > check_field(osd2.linux2.l_i_file_acl_high); > check_field(osd2.linux2.l_i_uid_high); > check_field(osd2.linux2.l_i_gid_high); > - check_field(osd2.linux2.l_i_reserved2); > + check_field(osd2.linux2.l_i_checksum_lo); > + check_field(osd2.linux2.l_i_reserved); > printf("Ending offset is %d\n\n", cur_offset); > #endif > } > diff --git a/lib/ext2fs/tst_super_size.c b/lib/ext2fs/tst_super_size.c > index 1e5a524..75659ae 100644 > --- a/lib/ext2fs/tst_super_size.c > +++ b/lib/ext2fs/tst_super_size.c > @@ -126,6 +126,7 @@ void check_superblock_fields() > check_field(s_usr_quota_inum); > check_field(s_grp_quota_inum); > check_field(s_overhead_blocks); > + check_field(s_checksum); > check_field(s_reserved); > printf("Ending offset is %d\n\n", cur_offset); > #endif > -- > 1.7.4.1.22.gec8e1.dirty > > -- > To unsubscribe from this list: send the line "unsubscribe linux-ext4" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Sep 20, 2011 at 9:56 PM, Darrick J. Wong <djwong@us.ibm.com> wrote: > On Thu, Sep 15, 2011 at 06:50:51PM -0400, Theodore Ts'o wrote: >> Reserve EXT4_FEATURE_RO_COMPAT_METADATA_CSUM and >> EXT2_FEATURE_COMPAT_EXCLUDE_BITMAP. Also reserve fields in the >> superblock and the inode for the checksums. In the block group >> descriptor, reserve the exclude bitmap field for the snapshot feature, >> and checksums for the inode and block allocation bitmaps. >> >> With this commit, the metadata checksum and exclude bitmap features >> should have reserved all of the fields they need in ext4's on-disk >> format. >> >> This commit also fixes an a missing byte swap for s_overhead_blocks. >> >> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> >> Cc: Darrick J. Wong <djwong@us.ibm.com> >> Cc: Amir Goldstein <amir73il@gmail.com> >> --- >> debugfs/set_fields.c | 3 ++- >> e2fsck/pass1.c | 4 ++-- >> lib/e2p/feature.c | 4 ++++ >> lib/e2p/ls.c | 4 ++++ >> lib/ext2fs/ext2_fs.h | 31 ++++++++++++++++++++++--------- >> lib/ext2fs/swapfs.c | 16 ++++++++++++++-- >> lib/ext2fs/tst_inode_size.c | 3 ++- >> lib/ext2fs/tst_super_size.c | 1 + >> 8 files changed, 51 insertions(+), 15 deletions(-) >> >> diff --git a/debugfs/set_fields.c b/debugfs/set_fields.c >> index ac6bc25..1a5ec7f 100644 >> --- a/debugfs/set_fields.c >> +++ b/debugfs/set_fields.c >> @@ -144,6 +144,7 @@ static struct field_set_info super_fields[] = { >> { "usr_quota_inum", &set_sb.s_usr_quota_inum, 4, parse_uint }, >> { "grp_quota_inum", &set_sb.s_grp_quota_inum, 4, parse_uint }, >> { "overhead_blocks", &set_sb.s_overhead_blocks, 4, parse_uint }, >> + { "checksum", &set_sb.s_checksum, 4, parse_uint }, >> { 0, 0, 0, 0 } >> }; >> >> @@ -179,6 +180,7 @@ static struct field_set_info inode_fields[] = { >> { "fsize", &set_inode.osd2.hurd2.h_i_fsize, 1, parse_uint }, >> { "uid_high", &set_inode.osd2.linux2.l_i_uid_high, 2, parse_uint }, >> { "gid_high", &set_inode.osd2.linux2.l_i_gid_high, 2, parse_uint }, >> + { "checksum", &set_inode.osd2.linux2.l_i_checksum_lo, 2, parse_uint }, >> { "author", &set_inode.osd2.hurd2.h_i_author, 4, parse_uint }, >> { "bmap", NULL, 4, parse_bmap, FLAG_ARRAY }, >> { 0, 0, 0, 0 } >> @@ -192,7 +194,6 @@ static struct field_set_info ext2_bg_fields[] = { >> { "free_inodes_count", &set_gd.bg_free_inodes_count, 2, parse_uint }, >> { "used_dirs_count", &set_gd.bg_used_dirs_count, 2, parse_uint }, >> { "flags", &set_gd.bg_flags, 2, parse_uint }, >> - { "reserved", &set_gd.bg_reserved, 2, parse_uint, FLAG_ARRAY, 2 }, >> { "itable_unused", &set_gd.bg_itable_unused, 2, parse_uint }, >> { "checksum", &set_gd.bg_checksum, 2, parse_gd_csum }, >> { 0, 0, 0, 0 } >> diff --git a/e2fsck/pass1.c b/e2fsck/pass1.c >> index dd18ade..16ddcda 100644 >> --- a/e2fsck/pass1.c >> +++ b/e2fsck/pass1.c >> @@ -364,8 +364,8 @@ static void check_inode_extra_space(e2fsck_t ctx, struct problem_context *pctx) >> printf("inode #%u, i_extra_size %d\n", pctx->ino, >> inode->i_extra_isize); >> #endif >> - /* i_extra_isize must cover i_extra_isize + i_pad1 at least */ >> - min = sizeof(inode->i_extra_isize) + sizeof(inode->i_pad1); >> + /* i_extra_isize must cover i_extra_isize + i_checksum_hi at least */ >> + min = sizeof(inode->i_extra_isize) + sizeof(inode->i_checksum_hi); >> max = EXT2_INODE_SIZE(sb) - EXT2_GOOD_OLD_INODE_SIZE; >> /* >> * For now we will allow i_extra_isize to be 0, but really >> diff --git a/lib/e2p/feature.c b/lib/e2p/feature.c >> index 16fba53..965fc16 100644 >> --- a/lib/e2p/feature.c >> +++ b/lib/e2p/feature.c >> @@ -40,6 +40,8 @@ static struct feature feature_list[] = { >> "resize_inode" }, >> { E2P_FEATURE_COMPAT, EXT2_FEATURE_COMPAT_LAZY_BG, >> "lazy_bg" }, >> + { E2P_FEATURE_COMPAT, EXT2_FEATURE_COMPAT_EXCLUDE_BITMAP, >> + "snapshot" }, >> >> { E2P_FEATURE_RO_INCOMPAT, EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER, >> "sparse_super" }, >> @@ -59,6 +61,8 @@ static struct feature feature_list[] = { >> "quota" }, >> { E2P_FEATURE_RO_INCOMPAT, EXT4_FEATURE_RO_COMPAT_BIGALLOC, >> "bigalloc"}, >> + { E2P_FEATURE_RO_INCOMPAT, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM, >> + "metadata_csum"}, >> >> { E2P_FEATURE_INCOMPAT, EXT2_FEATURE_INCOMPAT_COMPRESSION, >> "compression" }, >> diff --git a/lib/e2p/ls.c b/lib/e2p/ls.c >> index 0f36f40..aaacdaa 100644 >> --- a/lib/e2p/ls.c >> +++ b/lib/e2p/ls.c >> @@ -413,6 +413,10 @@ void list_super2(struct ext2_super_block * sb, FILE *f) >> if (sb->s_grp_quota_inum) >> fprintf(f, "Group quota inode: %u\n", >> sb->s_grp_quota_inum); >> + >> + if (sb->s_feature_ro_compat & EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) >> + fprintf(f, "Checksum: 0x%08x\n", >> + sb->s_checksum); >> } >> >> void list_super (struct ext2_super_block * s) >> diff --git a/lib/ext2fs/ext2_fs.h b/lib/ext2fs/ext2_fs.h >> index 4fec5db..1c86cb2 100644 >> --- a/lib/ext2fs/ext2_fs.h >> +++ b/lib/ext2fs/ext2_fs.h >> @@ -142,7 +142,9 @@ struct ext2_group_desc >> __u16 bg_free_inodes_count; /* Free inodes count */ >> __u16 bg_used_dirs_count; /* Directories count */ >> __u16 bg_flags; >> - __u32 bg_reserved[2]; >> + __u32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ >> + __u16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bitmap) LSB */ >> + __u16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bitmap) LSB */ >> __u16 bg_itable_unused; /* Unused inodes count */ >> __u16 bg_checksum; /* crc16(s_uuid+grouo_num+group_desc)*/ >> }; >> @@ -159,7 +161,9 @@ struct ext4_group_desc >> __u16 bg_free_inodes_count; /* Free inodes count */ >> __u16 bg_used_dirs_count; /* Directories count */ >> __u16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ >> - __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */ >> + __u32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ >> + __u16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bitmap) LSB */ >> + __u16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bitmap) LSB */ >> __u16 bg_itable_unused; /* Unused inodes count */ >> __u16 bg_checksum; /* crc16(sb_uuid+group+desc) */ >> __u32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ >> @@ -169,7 +173,10 @@ struct ext4_group_desc >> __u16 bg_free_inodes_count_hi;/* Free inodes count MSB */ >> __u16 bg_used_dirs_count_hi; /* Directories count MSB */ >> __u16 bg_itable_unused_hi; /* Unused inodes count MSB */ >> - __u32 bg_reserved2[3]; >> + __u32 bg_exclude_bitmap_hi; /* Exclude bitmap block MSB */ >> + __u16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bitmap) MSB */ >> + __u16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bitmap) MSB */ >> + __u32 bg_reserved; > > Should we attach a checksum to the exclude bitmap? That would, unfortunately, > put the 64-byte block group pretty close to full, unless we're ok with making > the bg even bigger... Good point. My initial approach (which I probably expressed somewhere) was that when exclude_bitmap feature is set, the block bitmap checksum should cover both block bitmap and exclude bitmap. The reason being that exclude bitmap adds little entropy over block bitmap: - it should always be a sub set of block bitmap bits - clearing exclude bit is done together with clearing the block used bit - the only case of modifying exclude bitmap only is when a used block becomes excluded (a.k.a moved to snapshot) so how do you feel about compressing 2 blocks into 16bits of checksum :-/? > > --D >> }; >> >> #define EXT2_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not initialized */ >> @@ -363,7 +370,8 @@ struct ext2_inode { >> __u16 l_i_file_acl_high; >> __u16 l_i_uid_high; /* these 2 fields */ >> __u16 l_i_gid_high; /* were reserved2[0] */ >> - __u32 l_i_reserved2; >> + __u16 l_i_checksum_lo; /* crc32c(uuid+inum+inode) */ >> + __u16 l_i_reserved; /* crc32c(uuid+inum+inode) */ >> } linux2; >> struct { >> __u8 h_i_frag; /* Fragment number */ >> @@ -410,7 +418,8 @@ struct ext2_inode_large { >> __u16 l_i_file_acl_high; >> __u16 l_i_uid_high; /* these 2 fields */ >> __u16 l_i_gid_high; /* were reserved2[0] */ >> - __u32 l_i_reserved2; >> + __u16 l_i_checksum_lo; /* crc32c(uuid+inum+inode) */ >> + __u16 l_i_reserved; >> } linux2; >> struct { >> __u8 h_i_frag; /* Fragment number */ >> @@ -422,7 +431,7 @@ struct ext2_inode_large { >> } hurd2; >> } osd2; /* OS dependent 2 */ >> __u16 i_extra_isize; >> - __u16 i_pad1; >> + __u16 i_checksum_hi; /* crc32c(uuid+inum+inode) */ >> __u32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ >> __u32 i_mtime_extra; /* extra Modification time (nsec << 2 | epoch) */ >> __u32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ >> @@ -441,7 +450,7 @@ struct ext2_inode_large { >> #define i_gid_low i_gid >> #define i_uid_high osd2.linux2.l_i_uid_high >> #define i_gid_high osd2.linux2.l_i_gid_high >> -#define i_reserved2 osd2.linux2.l_i_reserved2 >> +#define i_checksum osd2.linux2.l_i_checksum >> #else >> #if defined(__GNU__) >> >> @@ -623,7 +632,8 @@ struct ext2_super_block { >> __u32 s_usr_quota_inum; /* inode number of user quota file */ >> __u32 s_grp_quota_inum; /* inode number of group quota file */ >> __u32 s_overhead_blocks; /* overhead blocks/clusters in fs */ >> - __u32 s_reserved[109]; /* Padding to the end of the block */ >> + __u32 s_checksum; /* crc32c(superblock) */ >> + __u32 s_reserved[108]; /* Padding to the end of the block */ >> }; >> >> #define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) >> @@ -671,7 +681,9 @@ struct ext2_super_block { >> #define EXT2_FEATURE_COMPAT_RESIZE_INODE 0x0010 >> #define EXT2_FEATURE_COMPAT_DIR_INDEX 0x0020 >> #define EXT2_FEATURE_COMPAT_LAZY_BG 0x0040 >> -#define EXT2_FEATURE_COMPAT_EXCLUDE_INODE 0x0080 >> +/* #define EXT2_FEATURE_COMPAT_EXCLUDE_INODE 0x0080 not used, legacy */ >> +#define EXT2_FEATURE_COMPAT_EXCLUDE_BITMAP 0x0100 >> + >> >> #define EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 >> #define EXT2_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 >> @@ -683,6 +695,7 @@ struct ext2_super_block { >> #define EXT4_FEATURE_RO_COMPAT_HAS_SNAPSHOT 0x0080 >> #define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 >> #define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 >> +#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 >> >> #define EXT2_FEATURE_INCOMPAT_COMPRESSION 0x0001 >> #define EXT2_FEATURE_INCOMPAT_FILETYPE 0x0002 >> diff --git a/lib/ext2fs/swapfs.c b/lib/ext2fs/swapfs.c >> index 87b1a2e..d1c4a56 100644 >> --- a/lib/ext2fs/swapfs.c >> +++ b/lib/ext2fs/swapfs.c >> @@ -78,6 +78,8 @@ void ext2fs_swap_super(struct ext2_super_block * sb) >> sb->s_snapshot_list = ext2fs_swab32(sb->s_snapshot_list); >> sb->s_usr_quota_inum = ext2fs_swab32(sb->s_usr_quota_inum); >> sb->s_grp_quota_inum = ext2fs_swab32(sb->s_grp_quota_inum); >> + sb->s_overhead_blocks = ext2fs_swab32(sb->s_overhead_blocks); >> + sb->s_checksum = ext2fs_swab32(sb->s_checksum); >> >> for (i=0; i < 4; i++) >> sb->s_hash_seed[i] = ext2fs_swab32(sb->s_hash_seed[i]); >> @@ -106,6 +108,11 @@ void ext2fs_swap_group_desc2(ext2_filsys fs, struct ext2_group_desc *gdp) >> gdp->bg_free_inodes_count = ext2fs_swab16(gdp->bg_free_inodes_count); >> gdp->bg_used_dirs_count = ext2fs_swab16(gdp->bg_used_dirs_count); >> gdp->bg_flags = ext2fs_swab16(gdp->bg_flags); >> + gdp->bg_exclude_bitmap_lo = ext2fs_swab32(gdp->bg_exclude_bitmap_lo); >> + gdp->bg_block_bitmap_csum_lo = >> + ext2fs_swab16(gdp->bg_block_bitmap_csum_lo); >> + gdp->bg_inode_bitmap_csum_lo = >> + ext2fs_swab16(gdp->bg_inode_bitmap_csum_lo); >> gdp->bg_itable_unused = ext2fs_swab16(gdp->bg_itable_unused); >> gdp->bg_checksum = ext2fs_swab16(gdp->bg_checksum); >> /* If we're 32-bit, we're done */ >> @@ -125,6 +132,11 @@ void ext2fs_swap_group_desc2(ext2_filsys fs, struct ext2_group_desc *gdp) >> gdp4->bg_used_dirs_count_hi = >> ext2fs_swab16(gdp4->bg_used_dirs_count_hi); >> gdp4->bg_itable_unused_hi = ext2fs_swab16(gdp4->bg_itable_unused_hi); >> + gdp->bg_exclude_bitmap_hi = ext2fs_swab16(gdp->bg_exclude_bitmap_hi); >> + gdp->bg_block_bitmap_csum_hi = >> + ext2fs_swab16(gdp->bg_block_bitmap_csum_hi); >> + gdp->bg_inode_bitmap_csum_hi = >> + ext2fs_swab16(gdp->bg_inode_bitmap_csum_hi); >> } >> >> void ext2fs_swap_group_desc(struct ext2_group_desc *gdp) >> @@ -244,8 +256,8 @@ void ext2fs_swap_inode_full(ext2_filsys fs, struct ext2_inode_large *t, >> ext2fs_swab16 (f->osd2.linux2.l_i_uid_high); >> t->osd2.linux2.l_i_gid_high = >> ext2fs_swab16 (f->osd2.linux2.l_i_gid_high); >> - t->osd2.linux2.l_i_reserved2 = >> - ext2fs_swab32(f->osd2.linux2.l_i_reserved2); >> + t->osd2.linux2.l_i_checksum = >> + ext2fs_swab32(f->osd2.linux2.checksum); >> break; >> case EXT2_OS_HURD: >> t->osd1.hurd1.h_i_translator = >> diff --git a/lib/ext2fs/tst_inode_size.c b/lib/ext2fs/tst_inode_size.c >> index 962f1cd..a4f6247 100644 >> --- a/lib/ext2fs/tst_inode_size.c >> +++ b/lib/ext2fs/tst_inode_size.c >> @@ -61,7 +61,8 @@ void check_structure_fields() >> check_field(osd2.linux2.l_i_file_acl_high); >> check_field(osd2.linux2.l_i_uid_high); >> check_field(osd2.linux2.l_i_gid_high); >> - check_field(osd2.linux2.l_i_reserved2); >> + check_field(osd2.linux2.l_i_checksum_lo); >> + check_field(osd2.linux2.l_i_reserved); >> printf("Ending offset is %d\n\n", cur_offset); >> #endif >> } >> diff --git a/lib/ext2fs/tst_super_size.c b/lib/ext2fs/tst_super_size.c >> index 1e5a524..75659ae 100644 >> --- a/lib/ext2fs/tst_super_size.c >> +++ b/lib/ext2fs/tst_super_size.c >> @@ -126,6 +126,7 @@ void check_superblock_fields() >> check_field(s_usr_quota_inum); >> check_field(s_grp_quota_inum); >> check_field(s_overhead_blocks); >> + check_field(s_checksum); >> check_field(s_reserved); >> printf("Ending offset is %d\n\n", cur_offset); >> #endif >> -- >> 1.7.4.1.22.gec8e1.dirty >> >> -- >> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in >> the body of a message to majordomo@vger.kernel.org >> More majordomo info at http://vger.kernel.org/majordomo-info.html >> > > -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Sep 21, 2011 at 11:03:57AM +0300, Amir Goldstein wrote: > On Tue, Sep 20, 2011 at 9:56 PM, Darrick J. Wong <djwong@us.ibm.com> wrote: > > On Thu, Sep 15, 2011 at 06:50:51PM -0400, Theodore Ts'o wrote: <snip> > >> diff --git a/lib/ext2fs/ext2_fs.h b/lib/ext2fs/ext2_fs.h > >> index 4fec5db..1c86cb2 100644 > >> --- a/lib/ext2fs/ext2_fs.h > >> +++ b/lib/ext2fs/ext2_fs.h > >> @@ -142,7 +142,9 @@ struct ext2_group_desc > >> __u16 bg_free_inodes_count; /* Free inodes count */ > >> __u16 bg_used_dirs_count; /* Directories count */ > >> __u16 bg_flags; > >> - __u32 bg_reserved[2]; > >> + __u32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ > >> + __u16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bitmap) LSB */ > >> + __u16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bitmap) LSB */ > >> __u16 bg_itable_unused; /* Unused inodes count */ > >> __u16 bg_checksum; /* crc16(s_uuid+grouo_num+group_desc)*/ > >> }; > >> @@ -159,7 +161,9 @@ struct ext4_group_desc > >> __u16 bg_free_inodes_count; /* Free inodes count */ > >> __u16 bg_used_dirs_count; /* Directories count */ > >> __u16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ > >> - __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */ > >> + __u32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ > >> + __u16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bitmap) LSB */ > >> + __u16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bitmap) LSB */ > >> __u16 bg_itable_unused; /* Unused inodes count */ > >> __u16 bg_checksum; /* crc16(sb_uuid+group+desc) */ > >> __u32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ > >> @@ -169,7 +173,10 @@ struct ext4_group_desc > >> __u16 bg_free_inodes_count_hi;/* Free inodes count MSB */ > >> __u16 bg_used_dirs_count_hi; /* Directories count MSB */ > >> __u16 bg_itable_unused_hi; /* Unused inodes count MSB */ > >> - __u32 bg_reserved2[3]; > >> + __u32 bg_exclude_bitmap_hi; /* Exclude bitmap block MSB */ > >> + __u16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bitmap) MSB */ > >> + __u16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bitmap) MSB */ > >> + __u32 bg_reserved; > > > > Should we attach a checksum to the exclude bitmap? That would, unfortunately, > > put the 64-byte block group pretty close to full, unless we're ok with making > > the bg even bigger... > > Good point. > My initial approach (which I probably expressed somewhere) was that when > exclude_bitmap feature is set, the block bitmap checksum should cover both > block bitmap and exclude bitmap. > The reason being that exclude bitmap adds little entropy over block bitmap: > - it should always be a sub set of block bitmap bits > - clearing exclude bit is done together with clearing the block used bit > - the only case of modifying exclude bitmap only is when a used block becomes > excluded (a.k.a moved to snapshot) > > so how do you feel about compressing 2 blocks into 16bits of checksum :-/? Hrm.... is it generally the case that both block and exclude bitmaps are loaded at the same time? I don't think it'd be difficult to checksum both blocks unless it's common that one of the two are not in memory. --D -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Sep 21, 2011 at 11:40 PM, Darrick J. Wong <djwong@us.ibm.com> wrote: > On Wed, Sep 21, 2011 at 11:03:57AM +0300, Amir Goldstein wrote: >> On Tue, Sep 20, 2011 at 9:56 PM, Darrick J. Wong <djwong@us.ibm.com> wrote: >> > On Thu, Sep 15, 2011 at 06:50:51PM -0400, Theodore Ts'o wrote: > <snip> >> >> diff --git a/lib/ext2fs/ext2_fs.h b/lib/ext2fs/ext2_fs.h >> >> index 4fec5db..1c86cb2 100644 >> >> --- a/lib/ext2fs/ext2_fs.h >> >> +++ b/lib/ext2fs/ext2_fs.h >> >> @@ -142,7 +142,9 @@ struct ext2_group_desc >> >> __u16 bg_free_inodes_count; /* Free inodes count */ >> >> __u16 bg_used_dirs_count; /* Directories count */ >> >> __u16 bg_flags; >> >> - __u32 bg_reserved[2]; >> >> + __u32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ >> >> + __u16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bitmap) LSB */ >> >> + __u16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bitmap) LSB */ >> >> __u16 bg_itable_unused; /* Unused inodes count */ >> >> __u16 bg_checksum; /* crc16(s_uuid+grouo_num+group_desc)*/ >> >> }; >> >> @@ -159,7 +161,9 @@ struct ext4_group_desc >> >> __u16 bg_free_inodes_count; /* Free inodes count */ >> >> __u16 bg_used_dirs_count; /* Directories count */ >> >> __u16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ >> >> - __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */ >> >> + __u32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ >> >> + __u16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bitmap) LSB */ >> >> + __u16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bitmap) LSB */ >> >> __u16 bg_itable_unused; /* Unused inodes count */ >> >> __u16 bg_checksum; /* crc16(sb_uuid+group+desc) */ >> >> __u32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ >> >> @@ -169,7 +173,10 @@ struct ext4_group_desc >> >> __u16 bg_free_inodes_count_hi;/* Free inodes count MSB */ >> >> __u16 bg_used_dirs_count_hi; /* Directories count MSB */ >> >> __u16 bg_itable_unused_hi; /* Unused inodes count MSB */ >> >> - __u32 bg_reserved2[3]; >> >> + __u32 bg_exclude_bitmap_hi; /* Exclude bitmap block MSB */ >> >> + __u16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bitmap) MSB */ >> >> + __u16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bitmap) MSB */ >> >> + __u32 bg_reserved; >> > >> > Should we attach a checksum to the exclude bitmap? That would, unfortunately, >> > put the 64-byte block group pretty close to full, unless we're ok with making >> > the bg even bigger... >> >> Good point. >> My initial approach (which I probably expressed somewhere) was that when >> exclude_bitmap feature is set, the block bitmap checksum should cover both >> block bitmap and exclude bitmap. >> The reason being that exclude bitmap adds little entropy over block bitmap: >> - it should always be a sub set of block bitmap bits >> - clearing exclude bit is done together with clearing the block used bit >> - the only case of modifying exclude bitmap only is when a used block becomes >> excluded (a.k.a moved to snapshot) >> >> so how do you feel about compressing 2 blocks into 16bits of checksum :-/? > > Hrm.... is it generally the case that both block and exclude bitmaps are loaded > at the same time? I don't think it'd be difficult to checksum both blocks > unless it's common that one of the two are not in memory. > Hrm indeed. no. exclude bitmap is currently not loaded always together with block bitmap, so unless this is changed, we'll have to think of something else. However that brings me to wonder: block/inode/exclude bitmap seldom change more than a handful of bits/words at a time. Does it make sense to diff update the CRC of the bitmaps instead of recomputing it? Amir. -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Sep 22, 2011 at 05:12:32AM +0300, Amir Goldstein wrote: > On Wed, Sep 21, 2011 at 11:40 PM, Darrick J. Wong <djwong@us.ibm.com> wrote: > > On Wed, Sep 21, 2011 at 11:03:57AM +0300, Amir Goldstein wrote: > >> On Tue, Sep 20, 2011 at 9:56 PM, Darrick J. Wong <djwong@us.ibm.com> wrote: > >> > On Thu, Sep 15, 2011 at 06:50:51PM -0400, Theodore Ts'o wrote: > > <snip> > >> >> diff --git a/lib/ext2fs/ext2_fs.h b/lib/ext2fs/ext2_fs.h > >> >> index 4fec5db..1c86cb2 100644 > >> >> --- a/lib/ext2fs/ext2_fs.h > >> >> +++ b/lib/ext2fs/ext2_fs.h > >> >> @@ -142,7 +142,9 @@ struct ext2_group_desc > >> >> __u16 bg_free_inodes_count; /* Free inodes count */ > >> >> __u16 bg_used_dirs_count; /* Directories count */ > >> >> __u16 bg_flags; > >> >> - __u32 bg_reserved[2]; > >> >> + __u32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ > >> >> + __u16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bitmap) LSB */ > >> >> + __u16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bitmap) LSB */ > >> >> __u16 bg_itable_unused; /* Unused inodes count */ > >> >> __u16 bg_checksum; /* crc16(s_uuid+grouo_num+group_desc)*/ > >> >> }; > >> >> @@ -159,7 +161,9 @@ struct ext4_group_desc > >> >> __u16 bg_free_inodes_count; /* Free inodes count */ > >> >> __u16 bg_used_dirs_count; /* Directories count */ > >> >> __u16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ > >> >> - __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */ > >> >> + __u32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ > >> >> + __u16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bitmap) LSB */ > >> >> + __u16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bitmap) LSB */ > >> >> __u16 bg_itable_unused; /* Unused inodes count */ > >> >> __u16 bg_checksum; /* crc16(sb_uuid+group+desc) */ > >> >> __u32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ > >> >> @@ -169,7 +173,10 @@ struct ext4_group_desc > >> >> __u16 bg_free_inodes_count_hi;/* Free inodes count MSB */ > >> >> __u16 bg_used_dirs_count_hi; /* Directories count MSB */ > >> >> __u16 bg_itable_unused_hi; /* Unused inodes count MSB */ > >> >> - __u32 bg_reserved2[3]; > >> >> + __u32 bg_exclude_bitmap_hi; /* Exclude bitmap block MSB */ > >> >> + __u16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bitmap) MSB */ > >> >> + __u16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bitmap) MSB */ > >> >> + __u32 bg_reserved; > >> > > >> > Should we attach a checksum to the exclude bitmap? That would, unfortunately, > >> > put the 64-byte block group pretty close to full, unless we're ok with making > >> > the bg even bigger... > >> > >> Good point. > >> My initial approach (which I probably expressed somewhere) was that when > >> exclude_bitmap feature is set, the block bitmap checksum should cover both > >> block bitmap and exclude bitmap. > >> The reason being that exclude bitmap adds little entropy over block bitmap: > >> - it should always be a sub set of block bitmap bits > >> - clearing exclude bit is done together with clearing the block used bit > >> - the only case of modifying exclude bitmap only is when a used block becomes > >> excluded (a.k.a moved to snapshot) > >> > >> so how do you feel about compressing 2 blocks into 16bits of checksum :-/? > > > > Hrm.... is it generally the case that both block and exclude bitmaps are loaded > > at the same time? I don't think it'd be difficult to checksum both blocks > > unless it's common that one of the two are not in memory. > > > > Hrm indeed. no. exclude bitmap is currently not loaded always together with > block bitmap, so unless this is changed, we'll have to think of something else. Or... does it make sense to preload the exclude bitmap at the same time as the block bitmap? If I'm reading you correctly, (block_bitmap | exclude_bitmap) yields a bitmap of all blocks that are not available, correct? So in the case that exclude bitmaps are enabled, you'd need both to be in memory anyway. > However that brings me to wonder: > block/inode/exclude bitmap seldom change more than a handful of bits/words > at a time. > Does it make sense to diff update the CRC of the bitmaps instead of > recomputing it? Yes, that would be a good (later) optimization at least to consider. --D -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Sep 22, 2011 at 10:18 PM, Darrick J. Wong <djwong@us.ibm.com> wrote: > On Thu, Sep 22, 2011 at 05:12:32AM +0300, Amir Goldstein wrote: >> On Wed, Sep 21, 2011 at 11:40 PM, Darrick J. Wong <djwong@us.ibm.com> wrote: >> > On Wed, Sep 21, 2011 at 11:03:57AM +0300, Amir Goldstein wrote: >> >> On Tue, Sep 20, 2011 at 9:56 PM, Darrick J. Wong <djwong@us.ibm.com> wrote: >> >> > On Thu, Sep 15, 2011 at 06:50:51PM -0400, Theodore Ts'o wrote: >> > <snip> >> >> >> diff --git a/lib/ext2fs/ext2_fs.h b/lib/ext2fs/ext2_fs.h >> >> >> index 4fec5db..1c86cb2 100644 >> >> >> --- a/lib/ext2fs/ext2_fs.h >> >> >> +++ b/lib/ext2fs/ext2_fs.h >> >> >> @@ -142,7 +142,9 @@ struct ext2_group_desc >> >> >> __u16 bg_free_inodes_count; /* Free inodes count */ >> >> >> __u16 bg_used_dirs_count; /* Directories count */ >> >> >> __u16 bg_flags; >> >> >> - __u32 bg_reserved[2]; >> >> >> + __u32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ >> >> >> + __u16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bitmap) LSB */ >> >> >> + __u16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bitmap) LSB */ >> >> >> __u16 bg_itable_unused; /* Unused inodes count */ >> >> >> __u16 bg_checksum; /* crc16(s_uuid+grouo_num+group_desc)*/ >> >> >> }; >> >> >> @@ -159,7 +161,9 @@ struct ext4_group_desc >> >> >> __u16 bg_free_inodes_count; /* Free inodes count */ >> >> >> __u16 bg_used_dirs_count; /* Directories count */ >> >> >> __u16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ >> >> >> - __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */ >> >> >> + __u32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ >> >> >> + __u16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bitmap) LSB */ >> >> >> + __u16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bitmap) LSB */ >> >> >> __u16 bg_itable_unused; /* Unused inodes count */ >> >> >> __u16 bg_checksum; /* crc16(sb_uuid+group+desc) */ >> >> >> __u32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ >> >> >> @@ -169,7 +173,10 @@ struct ext4_group_desc >> >> >> __u16 bg_free_inodes_count_hi;/* Free inodes count MSB */ >> >> >> __u16 bg_used_dirs_count_hi; /* Directories count MSB */ >> >> >> __u16 bg_itable_unused_hi; /* Unused inodes count MSB */ >> >> >> - __u32 bg_reserved2[3]; >> >> >> + __u32 bg_exclude_bitmap_hi; /* Exclude bitmap block MSB */ >> >> >> + __u16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bitmap) MSB */ >> >> >> + __u16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bitmap) MSB */ >> >> >> + __u32 bg_reserved; >> >> > >> >> > Should we attach a checksum to the exclude bitmap? That would, unfortunately, >> >> > put the 64-byte block group pretty close to full, unless we're ok with making >> >> > the bg even bigger... >> >> >> >> Good point. >> >> My initial approach (which I probably expressed somewhere) was that when >> >> exclude_bitmap feature is set, the block bitmap checksum should cover both >> >> block bitmap and exclude bitmap. >> >> The reason being that exclude bitmap adds little entropy over block bitmap: >> >> - it should always be a sub set of block bitmap bits >> >> - clearing exclude bit is done together with clearing the block used bit >> >> - the only case of modifying exclude bitmap only is when a used block becomes >> >> excluded (a.k.a moved to snapshot) >> >> >> >> so how do you feel about compressing 2 blocks into 16bits of checksum :-/? >> > >> > Hrm.... is it generally the case that both block and exclude bitmaps are loaded >> > at the same time? I don't think it'd be difficult to checksum both blocks >> > unless it's common that one of the two are not in memory. >> > >> >> Hrm indeed. no. exclude bitmap is currently not loaded always together with >> block bitmap, so unless this is changed, we'll have to think of something else. > > Or... does it make sense to preload the exclude bitmap at the same time as the > block bitmap? it wouldn't be terrible to do that, since exclude bitmap will normally be located right next to block bitmap, but if memory pressure is high, that would lead to half the number of block bitmaps in memory at any given time. > If I'm reading you correctly, (block_bitmap | exclude_bitmap) > yields a bitmap of all blocks that are not available, correct? not exactly. if all is sane, then (block_bitmap | exclude_bitmap) == block_bitmap (block_bitmap & exclude_bitmap) == exclude_bitmap > So in the case > that exclude bitmaps are enabled, you'd need both to be in memory anyway. > exclude_bitmap is loaded into memory in the following cases: 1. allocating snapshot inode blocks (during COW) 2. deleting snapshot inode blocks (snapshot delete) 3. moving "deleted" blocks to snapshot (MOW) specifically, exclude_bitmap is NOT loaded into memory in the following cases: 1. allocating regular inode blocks 2. deleting regular inode blocks, which were allocated after last snapshot For example, in kernel compilation benchmark, all temp files are created and deleted without modifying exclude bitmap (or snapshots). >> However that brings me to wonder: >> block/inode/exclude bitmap seldom change more than a handful of bits/words >> at a time. >> Does it make sense to diff update the CRC of the bitmaps instead of >> recomputing it? > > Yes, that would be a good (later) optimization at least to consider. > > --D > -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/debugfs/set_fields.c b/debugfs/set_fields.c index ac6bc25..1a5ec7f 100644 --- a/debugfs/set_fields.c +++ b/debugfs/set_fields.c @@ -144,6 +144,7 @@ static struct field_set_info super_fields[] = { { "usr_quota_inum", &set_sb.s_usr_quota_inum, 4, parse_uint }, { "grp_quota_inum", &set_sb.s_grp_quota_inum, 4, parse_uint }, { "overhead_blocks", &set_sb.s_overhead_blocks, 4, parse_uint }, + { "checksum", &set_sb.s_checksum, 4, parse_uint }, { 0, 0, 0, 0 } }; @@ -179,6 +180,7 @@ static struct field_set_info inode_fields[] = { { "fsize", &set_inode.osd2.hurd2.h_i_fsize, 1, parse_uint }, { "uid_high", &set_inode.osd2.linux2.l_i_uid_high, 2, parse_uint }, { "gid_high", &set_inode.osd2.linux2.l_i_gid_high, 2, parse_uint }, + { "checksum", &set_inode.osd2.linux2.l_i_checksum_lo, 2, parse_uint }, { "author", &set_inode.osd2.hurd2.h_i_author, 4, parse_uint }, { "bmap", NULL, 4, parse_bmap, FLAG_ARRAY }, { 0, 0, 0, 0 } @@ -192,7 +194,6 @@ static struct field_set_info ext2_bg_fields[] = { { "free_inodes_count", &set_gd.bg_free_inodes_count, 2, parse_uint }, { "used_dirs_count", &set_gd.bg_used_dirs_count, 2, parse_uint }, { "flags", &set_gd.bg_flags, 2, parse_uint }, - { "reserved", &set_gd.bg_reserved, 2, parse_uint, FLAG_ARRAY, 2 }, { "itable_unused", &set_gd.bg_itable_unused, 2, parse_uint }, { "checksum", &set_gd.bg_checksum, 2, parse_gd_csum }, { 0, 0, 0, 0 } diff --git a/e2fsck/pass1.c b/e2fsck/pass1.c index dd18ade..16ddcda 100644 --- a/e2fsck/pass1.c +++ b/e2fsck/pass1.c @@ -364,8 +364,8 @@ static void check_inode_extra_space(e2fsck_t ctx, struct problem_context *pctx) printf("inode #%u, i_extra_size %d\n", pctx->ino, inode->i_extra_isize); #endif - /* i_extra_isize must cover i_extra_isize + i_pad1 at least */ - min = sizeof(inode->i_extra_isize) + sizeof(inode->i_pad1); + /* i_extra_isize must cover i_extra_isize + i_checksum_hi at least */ + min = sizeof(inode->i_extra_isize) + sizeof(inode->i_checksum_hi); max = EXT2_INODE_SIZE(sb) - EXT2_GOOD_OLD_INODE_SIZE; /* * For now we will allow i_extra_isize to be 0, but really diff --git a/lib/e2p/feature.c b/lib/e2p/feature.c index 16fba53..965fc16 100644 --- a/lib/e2p/feature.c +++ b/lib/e2p/feature.c @@ -40,6 +40,8 @@ static struct feature feature_list[] = { "resize_inode" }, { E2P_FEATURE_COMPAT, EXT2_FEATURE_COMPAT_LAZY_BG, "lazy_bg" }, + { E2P_FEATURE_COMPAT, EXT2_FEATURE_COMPAT_EXCLUDE_BITMAP, + "snapshot" }, { E2P_FEATURE_RO_INCOMPAT, EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER, "sparse_super" }, @@ -59,6 +61,8 @@ static struct feature feature_list[] = { "quota" }, { E2P_FEATURE_RO_INCOMPAT, EXT4_FEATURE_RO_COMPAT_BIGALLOC, "bigalloc"}, + { E2P_FEATURE_RO_INCOMPAT, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM, + "metadata_csum"}, { E2P_FEATURE_INCOMPAT, EXT2_FEATURE_INCOMPAT_COMPRESSION, "compression" }, diff --git a/lib/e2p/ls.c b/lib/e2p/ls.c index 0f36f40..aaacdaa 100644 --- a/lib/e2p/ls.c +++ b/lib/e2p/ls.c @@ -413,6 +413,10 @@ void list_super2(struct ext2_super_block * sb, FILE *f) if (sb->s_grp_quota_inum) fprintf(f, "Group quota inode: %u\n", sb->s_grp_quota_inum); + + if (sb->s_feature_ro_compat & EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) + fprintf(f, "Checksum: 0x%08x\n", + sb->s_checksum); } void list_super (struct ext2_super_block * s) diff --git a/lib/ext2fs/ext2_fs.h b/lib/ext2fs/ext2_fs.h index 4fec5db..1c86cb2 100644 --- a/lib/ext2fs/ext2_fs.h +++ b/lib/ext2fs/ext2_fs.h @@ -142,7 +142,9 @@ struct ext2_group_desc __u16 bg_free_inodes_count; /* Free inodes count */ __u16 bg_used_dirs_count; /* Directories count */ __u16 bg_flags; - __u32 bg_reserved[2]; + __u32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ + __u16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bitmap) LSB */ + __u16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bitmap) LSB */ __u16 bg_itable_unused; /* Unused inodes count */ __u16 bg_checksum; /* crc16(s_uuid+grouo_num+group_desc)*/ }; @@ -159,7 +161,9 @@ struct ext4_group_desc __u16 bg_free_inodes_count; /* Free inodes count */ __u16 bg_used_dirs_count; /* Directories count */ __u16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ - __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */ + __u32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ + __u16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bitmap) LSB */ + __u16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bitmap) LSB */ __u16 bg_itable_unused; /* Unused inodes count */ __u16 bg_checksum; /* crc16(sb_uuid+group+desc) */ __u32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ @@ -169,7 +173,10 @@ struct ext4_group_desc __u16 bg_free_inodes_count_hi;/* Free inodes count MSB */ __u16 bg_used_dirs_count_hi; /* Directories count MSB */ __u16 bg_itable_unused_hi; /* Unused inodes count MSB */ - __u32 bg_reserved2[3]; + __u32 bg_exclude_bitmap_hi; /* Exclude bitmap block MSB */ + __u16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bitmap) MSB */ + __u16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bitmap) MSB */ + __u32 bg_reserved; }; #define EXT2_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not initialized */ @@ -363,7 +370,8 @@ struct ext2_inode { __u16 l_i_file_acl_high; __u16 l_i_uid_high; /* these 2 fields */ __u16 l_i_gid_high; /* were reserved2[0] */ - __u32 l_i_reserved2; + __u16 l_i_checksum_lo; /* crc32c(uuid+inum+inode) */ + __u16 l_i_reserved; /* crc32c(uuid+inum+inode) */ } linux2; struct { __u8 h_i_frag; /* Fragment number */ @@ -410,7 +418,8 @@ struct ext2_inode_large { __u16 l_i_file_acl_high; __u16 l_i_uid_high; /* these 2 fields */ __u16 l_i_gid_high; /* were reserved2[0] */ - __u32 l_i_reserved2; + __u16 l_i_checksum_lo; /* crc32c(uuid+inum+inode) */ + __u16 l_i_reserved; } linux2; struct { __u8 h_i_frag; /* Fragment number */ @@ -422,7 +431,7 @@ struct ext2_inode_large { } hurd2; } osd2; /* OS dependent 2 */ __u16 i_extra_isize; - __u16 i_pad1; + __u16 i_checksum_hi; /* crc32c(uuid+inum+inode) */ __u32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ __u32 i_mtime_extra; /* extra Modification time (nsec << 2 | epoch) */ __u32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ @@ -441,7 +450,7 @@ struct ext2_inode_large { #define i_gid_low i_gid #define i_uid_high osd2.linux2.l_i_uid_high #define i_gid_high osd2.linux2.l_i_gid_high -#define i_reserved2 osd2.linux2.l_i_reserved2 +#define i_checksum osd2.linux2.l_i_checksum #else #if defined(__GNU__) @@ -623,7 +632,8 @@ struct ext2_super_block { __u32 s_usr_quota_inum; /* inode number of user quota file */ __u32 s_grp_quota_inum; /* inode number of group quota file */ __u32 s_overhead_blocks; /* overhead blocks/clusters in fs */ - __u32 s_reserved[109]; /* Padding to the end of the block */ + __u32 s_checksum; /* crc32c(superblock) */ + __u32 s_reserved[108]; /* Padding to the end of the block */ }; #define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) @@ -671,7 +681,9 @@ struct ext2_super_block { #define EXT2_FEATURE_COMPAT_RESIZE_INODE 0x0010 #define EXT2_FEATURE_COMPAT_DIR_INDEX 0x0020 #define EXT2_FEATURE_COMPAT_LAZY_BG 0x0040 -#define EXT2_FEATURE_COMPAT_EXCLUDE_INODE 0x0080 +/* #define EXT2_FEATURE_COMPAT_EXCLUDE_INODE 0x0080 not used, legacy */ +#define EXT2_FEATURE_COMPAT_EXCLUDE_BITMAP 0x0100 + #define EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 #define EXT2_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 @@ -683,6 +695,7 @@ struct ext2_super_block { #define EXT4_FEATURE_RO_COMPAT_HAS_SNAPSHOT 0x0080 #define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 #define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 +#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 #define EXT2_FEATURE_INCOMPAT_COMPRESSION 0x0001 #define EXT2_FEATURE_INCOMPAT_FILETYPE 0x0002 diff --git a/lib/ext2fs/swapfs.c b/lib/ext2fs/swapfs.c index 87b1a2e..d1c4a56 100644 --- a/lib/ext2fs/swapfs.c +++ b/lib/ext2fs/swapfs.c @@ -78,6 +78,8 @@ void ext2fs_swap_super(struct ext2_super_block * sb) sb->s_snapshot_list = ext2fs_swab32(sb->s_snapshot_list); sb->s_usr_quota_inum = ext2fs_swab32(sb->s_usr_quota_inum); sb->s_grp_quota_inum = ext2fs_swab32(sb->s_grp_quota_inum); + sb->s_overhead_blocks = ext2fs_swab32(sb->s_overhead_blocks); + sb->s_checksum = ext2fs_swab32(sb->s_checksum); for (i=0; i < 4; i++) sb->s_hash_seed[i] = ext2fs_swab32(sb->s_hash_seed[i]); @@ -106,6 +108,11 @@ void ext2fs_swap_group_desc2(ext2_filsys fs, struct ext2_group_desc *gdp) gdp->bg_free_inodes_count = ext2fs_swab16(gdp->bg_free_inodes_count); gdp->bg_used_dirs_count = ext2fs_swab16(gdp->bg_used_dirs_count); gdp->bg_flags = ext2fs_swab16(gdp->bg_flags); + gdp->bg_exclude_bitmap_lo = ext2fs_swab32(gdp->bg_exclude_bitmap_lo); + gdp->bg_block_bitmap_csum_lo = + ext2fs_swab16(gdp->bg_block_bitmap_csum_lo); + gdp->bg_inode_bitmap_csum_lo = + ext2fs_swab16(gdp->bg_inode_bitmap_csum_lo); gdp->bg_itable_unused = ext2fs_swab16(gdp->bg_itable_unused); gdp->bg_checksum = ext2fs_swab16(gdp->bg_checksum); /* If we're 32-bit, we're done */ @@ -125,6 +132,11 @@ void ext2fs_swap_group_desc2(ext2_filsys fs, struct ext2_group_desc *gdp) gdp4->bg_used_dirs_count_hi = ext2fs_swab16(gdp4->bg_used_dirs_count_hi); gdp4->bg_itable_unused_hi = ext2fs_swab16(gdp4->bg_itable_unused_hi); + gdp->bg_exclude_bitmap_hi = ext2fs_swab16(gdp->bg_exclude_bitmap_hi); + gdp->bg_block_bitmap_csum_hi = + ext2fs_swab16(gdp->bg_block_bitmap_csum_hi); + gdp->bg_inode_bitmap_csum_hi = + ext2fs_swab16(gdp->bg_inode_bitmap_csum_hi); } void ext2fs_swap_group_desc(struct ext2_group_desc *gdp) @@ -244,8 +256,8 @@ void ext2fs_swap_inode_full(ext2_filsys fs, struct ext2_inode_large *t, ext2fs_swab16 (f->osd2.linux2.l_i_uid_high); t->osd2.linux2.l_i_gid_high = ext2fs_swab16 (f->osd2.linux2.l_i_gid_high); - t->osd2.linux2.l_i_reserved2 = - ext2fs_swab32(f->osd2.linux2.l_i_reserved2); + t->osd2.linux2.l_i_checksum = + ext2fs_swab32(f->osd2.linux2.checksum); break; case EXT2_OS_HURD: t->osd1.hurd1.h_i_translator = diff --git a/lib/ext2fs/tst_inode_size.c b/lib/ext2fs/tst_inode_size.c index 962f1cd..a4f6247 100644 --- a/lib/ext2fs/tst_inode_size.c +++ b/lib/ext2fs/tst_inode_size.c @@ -61,7 +61,8 @@ void check_structure_fields() check_field(osd2.linux2.l_i_file_acl_high); check_field(osd2.linux2.l_i_uid_high); check_field(osd2.linux2.l_i_gid_high); - check_field(osd2.linux2.l_i_reserved2); + check_field(osd2.linux2.l_i_checksum_lo); + check_field(osd2.linux2.l_i_reserved); printf("Ending offset is %d\n\n", cur_offset); #endif } diff --git a/lib/ext2fs/tst_super_size.c b/lib/ext2fs/tst_super_size.c index 1e5a524..75659ae 100644 --- a/lib/ext2fs/tst_super_size.c +++ b/lib/ext2fs/tst_super_size.c @@ -126,6 +126,7 @@ void check_superblock_fields() check_field(s_usr_quota_inum); check_field(s_grp_quota_inum); check_field(s_overhead_blocks); + check_field(s_checksum); check_field(s_reserved); printf("Ending offset is %d\n\n", cur_offset); #endif
Reserve EXT4_FEATURE_RO_COMPAT_METADATA_CSUM and EXT2_FEATURE_COMPAT_EXCLUDE_BITMAP. Also reserve fields in the superblock and the inode for the checksums. In the block group descriptor, reserve the exclude bitmap field for the snapshot feature, and checksums for the inode and block allocation bitmaps. With this commit, the metadata checksum and exclude bitmap features should have reserved all of the fields they need in ext4's on-disk format. This commit also fixes an a missing byte swap for s_overhead_blocks. Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> Cc: Darrick J. Wong <djwong@us.ibm.com> Cc: Amir Goldstein <amir73il@gmail.com> --- debugfs/set_fields.c | 3 ++- e2fsck/pass1.c | 4 ++-- lib/e2p/feature.c | 4 ++++ lib/e2p/ls.c | 4 ++++ lib/ext2fs/ext2_fs.h | 31 ++++++++++++++++++++++--------- lib/ext2fs/swapfs.c | 16 ++++++++++++++-- lib/ext2fs/tst_inode_size.c | 3 ++- lib/ext2fs/tst_super_size.c | 1 + 8 files changed, 51 insertions(+), 15 deletions(-)