diff mbox

[08/16] ext4: Calculate and verify checksums for inode bitmaps

Message ID 20110901003127.31048.13983.stgit@elm3c44.beaverton.ibm.com
State Superseded, archived
Headers show

Commit Message

Darrick J. Wong Sept. 1, 2011, 12:31 a.m. UTC
Compute and verify the checksum of the inode bitmap; the checkum is stored in
the block group descriptor.

Signed-off-by: Darrick J. Wong <djwong@us.ibm.com>
---
 fs/ext4/ext4.h   |    3 ++-
 fs/ext4/ialloc.c |   33 ++++++++++++++++++++++++++++++---
 2 files changed, 32 insertions(+), 4 deletions(-)



--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Andreas Dilger Sept. 1, 2011, 4:49 a.m. UTC | #1
On 2011-08-31, at 6:31 PM, Darrick J. Wong wrote:
> Compute and verify the checksum of the inode bitmap; the checkum is stored in
> the block group descriptor.
> 
> Signed-off-by: Darrick J. Wong <djwong@us.ibm.com>
> ---
> fs/ext4/ext4.h   |    3 ++-
> fs/ext4/ialloc.c |   33 ++++++++++++++++++++++++++++++---
> 2 files changed, 32 insertions(+), 4 deletions(-)
> 
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index bc7ace1..248cbd2 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -279,7 +279,8 @@ struct ext4_group_desc
> 	__le16	bg_free_inodes_count_hi;/* Free inodes count MSB */
> 	__le16	bg_used_dirs_count_hi;	/* Directories count MSB */
> 	__le16  bg_itable_unused_hi;    /* Unused inodes count MSB */
> -	__u32	bg_reserved2[3];
> +	__le32	bg_inode_bitmap_csum;	/* crc32c(uuid+group+ibitmap) */
> +	__u32	bg_reserved2[2];
> };

I would prefer if there was a 16-bit checksum for the (most common)
32-byte group descriptors, and this was extended to a 32-bit checksum
for the (much less common) 64-byte+ group descriptors.  For filesystems
that are newly formatted with the 64bit feature it makes no difference,
but virtually all ext3/4 filesystems have only the smaller group descriptors.

Regardless of whether using half of the crc32c is better or worse than
using crc16 for the bitmap blocks, storing _any_ checksum is better than
storing nothing at all.  I would propose the following:

struct ext4_group_desc
{
        __le32 bg_block_bitmap_lo;	/* Blocks bitmap block */
        __le32 bg_inode_bitmap_lo;	/* Inodes bitmap block */
        __le32 bg_inode_table_lo;	/* Inodes table block */
        __le16 bg_free_blocks_count_lo;	/* Free blocks count */
        __le16 bg_free_inodes_count_lo;	/* Free inodes count */
        __le16 bg_used_dirs_count_lo;	/* Directories count */
        __le16 bg_flags;		/* EXT4_BG_flags (INODE_UNINIT, etc) */
        __le32 bg_exclude_bitmap_lo;	/* Exclude bitmap block */
        __le16 bg_block_bitmap_csum_lo;	/* Block bitmap checksum */
	__le16 bg_inode_bitmap_csum_lo;	/* Inode bitmap checksum */
        __le16 bg_itable_unused_lo;	/* Unused inodes count */
        __le16 bg_checksum;		/* crc16(sb_uuid+group+desc) */
        __le32 bg_block_bitmap_hi;	/* Blocks bitmap block MSB */
        __le32 bg_inode_bitmap_hi;	/* Inodes bitmap block MSB */
        __le32 bg_inode_table_hi;	/* Inodes table block MSB */
        __le16 bg_free_blocks_count_hi;	/* Free blocks count MSB */
        __le16 bg_free_inodes_count_hi;	/* Free inodes count MSB */
        __le16 bg_used_dirs_count_hi;	/* Directories count MSB */
        __le16 bg_itable_unused_hi;	/* Unused inodes count MSB */
	__le32 bg_exclude_bitmap_hi;	/* Exclude bitmap block MSB */
	__le16 bg_block_bitmap_csum_hi;	/* Blocks bitmap checksum MSB */
	__le16 bg_inode_bitmap_csum_hi;	/* Inodes bitmap checksum MSB */
        __le32 bg_reserved2;
};

This is also different from your layout because it locates the block bitmap
checksum field before the inode bitmap checksum, to more closely match the
order of other fields in this structure.

> /*
> diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
> index 9c63f27..53faffc 100644
> --- a/fs/ext4/ialloc.c
> +++ b/fs/ext4/ialloc.c
> @@ -82,12 +82,18 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
> 		ext4_free_inodes_set(sb, gdp, 0);
> 		ext4_itable_unused_set(sb, gdp, 0);
> 		memset(bh->b_data, 0xff, sb->s_blocksize);
> +		ext4_bitmap_csum_set(sb, block_group,
> +				     &gdp->bg_inode_bitmap_csum, bh,
> +				     (EXT4_INODES_PER_GROUP(sb) + 7) / 8);

The number of inodes per group is already always a multiple of 8.

> 		return 0;
> 	}
> 
> 	memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
> 	ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
> 			bh->b_data);
> +	ext4_bitmap_csum_set(sb, block_group, &gdp->bg_inode_bitmap_csum, bh,
> +			     (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
> +	gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
> 
> 	return EXT4_INODES_PER_GROUP(sb);
> }
> @@ -118,12 +124,12 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
> 		return NULL;
> 	}
> 	if (bitmap_uptodate(bh))
> -		return bh;
> +		goto verify;
> 
> 	lock_buffer(bh);
> 	if (bitmap_uptodate(bh)) {
> 		unlock_buffer(bh);
> -		return bh;
> +		goto verify;
> 	}
> 
> 	ext4_lock_group(sb, block_group);
> @@ -131,6 +137,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
> 		ext4_init_inode_bitmap(sb, bh, block_group, desc);
> 		set_bitmap_uptodate(bh);
> 		set_buffer_uptodate(bh);
> +		set_buffer_verified(bh);
> 		ext4_unlock_group(sb, block_group);
> 		unlock_buffer(bh);
> 		return bh;
> @@ -144,7 +151,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
> 		 */
> 		set_bitmap_uptodate(bh);
> 		unlock_buffer(bh);
> -		return bh;
> +		goto verify;
> 	}
> 	/*
> 	 * submit the buffer_head for read. We can
> @@ -161,6 +168,21 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
> 			    block_group, bitmap_blk);
> 		return NULL;
> 	}
> +
> +verify:
> +	ext4_lock_group(sb, block_group);
> +	if (!buffer_verified(bh) &&
> +	    !ext4_bitmap_csum_verify(sb, block_group,
> +				     desc->bg_inode_bitmap_csum, bh,
> +				     (EXT4_INODES_PER_GROUP(sb) + 7) / 8)) {
> +		ext4_unlock_group(sb, block_group);
> +		put_bh(bh);
> +		ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
> +			   "inode_bitmap = %llu", block_group, bitmap_blk);
> +		return NULL;

At some point we should add a flag like EXT4_BG_INODE_ERROR so that the
group can be marked in error on disk, and skipped for future allocations,
but the whole filesystem does not need to be remounted read-only.  That's
for another patch, however.

> +	}
> +	ext4_unlock_group(sb, block_group);
> +	set_buffer_verified(bh);
> 	return bh;
> }
> 
> @@ -265,6 +287,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
> 		ext4_used_dirs_set(sb, gdp, count);
> 		percpu_counter_dec(&sbi->s_dirs_counter);
> 	}
> +	ext4_bitmap_csum_set(sb, block_group, &gdp->bg_inode_bitmap_csum,
> +			     bitmap_bh, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
> 	gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
> 	ext4_unlock_group(sb, block_group);
> 
> @@ -784,6 +808,9 @@ static int ext4_claim_inode(struct super_block *sb,
> 			atomic_inc(&sbi->s_flex_groups[f].used_dirs);
> 		}
> 	}
> +	ext4_bitmap_csum_set(sb, group, &gdp->bg_inode_bitmap_csum,
> +			     inode_bitmap_bh,
> +			     (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
> 	gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
> err_ret:
> 	ext4_unlock_group(sb, group);
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong Sept. 2, 2011, 7:18 p.m. UTC | #2
On Wed, Aug 31, 2011 at 10:49:05PM -0600, Andreas Dilger wrote:
> On 2011-08-31, at 6:31 PM, Darrick J. Wong wrote:
> > Compute and verify the checksum of the inode bitmap; the checkum is stored in
> > the block group descriptor.
> > 
> > Signed-off-by: Darrick J. Wong <djwong@us.ibm.com>
> > ---
> > fs/ext4/ext4.h   |    3 ++-
> > fs/ext4/ialloc.c |   33 ++++++++++++++++++++++++++++++---
> > 2 files changed, 32 insertions(+), 4 deletions(-)
> > 
> > 
> > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> > index bc7ace1..248cbd2 100644
> > --- a/fs/ext4/ext4.h
> > +++ b/fs/ext4/ext4.h
> > @@ -279,7 +279,8 @@ struct ext4_group_desc
> > 	__le16	bg_free_inodes_count_hi;/* Free inodes count MSB */
> > 	__le16	bg_used_dirs_count_hi;	/* Directories count MSB */
> > 	__le16  bg_itable_unused_hi;    /* Unused inodes count MSB */
> > -	__u32	bg_reserved2[3];
> > +	__le32	bg_inode_bitmap_csum;	/* crc32c(uuid+group+ibitmap) */
> > +	__u32	bg_reserved2[2];
> > };
> 
> I would prefer if there was a 16-bit checksum for the (most common)
> 32-byte group descriptors, and this was extended to a 32-bit checksum
> for the (much less common) 64-byte+ group descriptors.  For filesystems
> that are newly formatted with the 64bit feature it makes no difference,
> but virtually all ext3/4 filesystems have only the smaller group descriptors.
> 
> Regardless of whether using half of the crc32c is better or worse than
> using crc16 for the bitmap blocks, storing _any_ checksum is better than
> storing nothing at all.  I would propose the following:

That's an interesting reframing of the argument that I hadn't considered.  I'd
fallen into the idea of needing crc32c because of its bit error guarantees (all
corruptions of odd numbers of bits and all corruptions of fewer than ...4?
bits) that I hadn't quite realized that even if crc16 can't guarantee to find
any corruption at all, it still _might_, and that's better than nothing.

Ok, let's split the 32-bit fields and use crc16 for the case of 32-byte block
group descriptors.

> struct ext4_group_desc
> {
>         __le32 bg_block_bitmap_lo;	/* Blocks bitmap block */
>         __le32 bg_inode_bitmap_lo;	/* Inodes bitmap block */
>         __le32 bg_inode_table_lo;	/* Inodes table block */
>         __le16 bg_free_blocks_count_lo;	/* Free blocks count */
>         __le16 bg_free_inodes_count_lo;	/* Free inodes count */
>         __le16 bg_used_dirs_count_lo;	/* Directories count */
>         __le16 bg_flags;		/* EXT4_BG_flags (INODE_UNINIT, etc) */
>         __le32 bg_exclude_bitmap_lo;	/* Exclude bitmap block */
>         __le16 bg_block_bitmap_csum_lo;	/* Block bitmap checksum */
> 	__le16 bg_inode_bitmap_csum_lo;	/* Inode bitmap checksum */
>         __le16 bg_itable_unused_lo;	/* Unused inodes count */
>         __le16 bg_checksum;		/* crc16(sb_uuid+group+desc) */
>         __le32 bg_block_bitmap_hi;	/* Blocks bitmap block MSB */
>         __le32 bg_inode_bitmap_hi;	/* Inodes bitmap block MSB */
>         __le32 bg_inode_table_hi;	/* Inodes table block MSB */
>         __le16 bg_free_blocks_count_hi;	/* Free blocks count MSB */
>         __le16 bg_free_inodes_count_hi;	/* Free inodes count MSB */
>         __le16 bg_used_dirs_count_hi;	/* Directories count MSB */
>         __le16 bg_itable_unused_hi;	/* Unused inodes count MSB */
> 	__le32 bg_exclude_bitmap_hi;	/* Exclude bitmap block MSB */
> 	__le16 bg_block_bitmap_csum_hi;	/* Blocks bitmap checksum MSB */
> 	__le16 bg_inode_bitmap_csum_hi;	/* Inodes bitmap checksum MSB */
>         __le32 bg_reserved2;
> };
> 
> This is also different from your layout because it locates the block bitmap
> checksum field before the inode bitmap checksum, to more closely match the
> order of other fields in this structure.

Er.. I reversed the order in the structure definition just prior to publishing,
and forgot to update the wiki page.  Well I guess I'm about to update it again.
:)

> > /*
> > diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
> > index 9c63f27..53faffc 100644
> > --- a/fs/ext4/ialloc.c
> > +++ b/fs/ext4/ialloc.c
> > @@ -82,12 +82,18 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
> > 		ext4_free_inodes_set(sb, gdp, 0);
> > 		ext4_itable_unused_set(sb, gdp, 0);
> > 		memset(bh->b_data, 0xff, sb->s_blocksize);
> > +		ext4_bitmap_csum_set(sb, block_group,
> > +				     &gdp->bg_inode_bitmap_csum, bh,
> > +				     (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
> 
> The number of inodes per group is already always a multiple of 8.

Ok.  I suppose we can fix that in the lines below too.

> > 		return 0;
> > 	}
> > 
> > 	memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
> > 	ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
> > 			bh->b_data);
> > +	ext4_bitmap_csum_set(sb, block_group, &gdp->bg_inode_bitmap_csum, bh,
> > +			     (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
> > +	gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
> > 
> > 	return EXT4_INODES_PER_GROUP(sb);
> > }
> > @@ -118,12 +124,12 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
> > 		return NULL;
> > 	}
> > 	if (bitmap_uptodate(bh))
> > -		return bh;
> > +		goto verify;
> > 
> > 	lock_buffer(bh);
> > 	if (bitmap_uptodate(bh)) {
> > 		unlock_buffer(bh);
> > -		return bh;
> > +		goto verify;
> > 	}
> > 
> > 	ext4_lock_group(sb, block_group);
> > @@ -131,6 +137,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
> > 		ext4_init_inode_bitmap(sb, bh, block_group, desc);
> > 		set_bitmap_uptodate(bh);
> > 		set_buffer_uptodate(bh);
> > +		set_buffer_verified(bh);
> > 		ext4_unlock_group(sb, block_group);
> > 		unlock_buffer(bh);
> > 		return bh;
> > @@ -144,7 +151,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
> > 		 */
> > 		set_bitmap_uptodate(bh);
> > 		unlock_buffer(bh);
> > -		return bh;
> > +		goto verify;
> > 	}
> > 	/*
> > 	 * submit the buffer_head for read. We can
> > @@ -161,6 +168,21 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
> > 			    block_group, bitmap_blk);
> > 		return NULL;
> > 	}
> > +
> > +verify:
> > +	ext4_lock_group(sb, block_group);
> > +	if (!buffer_verified(bh) &&
> > +	    !ext4_bitmap_csum_verify(sb, block_group,
> > +				     desc->bg_inode_bitmap_csum, bh,
> > +				     (EXT4_INODES_PER_GROUP(sb) + 7) / 8)) {
> > +		ext4_unlock_group(sb, block_group);
> > +		put_bh(bh);
> > +		ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
> > +			   "inode_bitmap = %llu", block_group, bitmap_blk);
> > +		return NULL;
> 
> At some point we should add a flag like EXT4_BG_INODE_ERROR so that the
> group can be marked in error on disk, and skipped for future allocations,
> but the whole filesystem does not need to be remounted read-only.  That's
> for another patch, however.

Agreed. :)

--D

> > +	}
> > +	ext4_unlock_group(sb, block_group);
> > +	set_buffer_verified(bh);
> > 	return bh;
> > }
> > 
> > @@ -265,6 +287,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
> > 		ext4_used_dirs_set(sb, gdp, count);
> > 		percpu_counter_dec(&sbi->s_dirs_counter);
> > 	}
> > +	ext4_bitmap_csum_set(sb, block_group, &gdp->bg_inode_bitmap_csum,
> > +			     bitmap_bh, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
> > 	gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
> > 	ext4_unlock_group(sb, block_group);
> > 
> > @@ -784,6 +808,9 @@ static int ext4_claim_inode(struct super_block *sb,
> > 			atomic_inc(&sbi->s_flex_groups[f].used_dirs);
> > 		}
> > 	}
> > +	ext4_bitmap_csum_set(sb, group, &gdp->bg_inode_bitmap_csum,
> > +			     inode_bitmap_bh,
> > +			     (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
> > 	gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
> > err_ret:
> > 	ext4_unlock_group(sb, group);
> > 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andreas Dilger Sept. 2, 2011, 9:27 p.m. UTC | #3
On 2011-09-02, at 1:18 PM, Darrick J. Wong wrote:
> On Wed, Aug 31, 2011 at 10:49:05PM -0600, Andreas Dilger wrote:
>> On 2011-08-31, at 6:31 PM, Darrick J. Wong wrote:
>>> Compute and verify the checksum of the inode bitmap; the checkum is stored in
>>> the block group descriptor.
>> 
>> I would prefer if there was a 16-bit checksum for the (most common)
>> 32-byte group descriptors, and this was extended to a 32-bit checksum
>> for the (much less common) 64-byte+ group descriptors.  For filesystems
>> that are newly formatted with the 64bit feature it makes no difference,
>> but virtually all ext3/4 filesystems have only the smaller group descriptors.
>> 
>> Regardless of whether using half of the crc32c is better or worse than
>> using crc16 for the bitmap blocks, storing _any_ checksum is better than
>> storing nothing at all.  I would propose the following:
> 
> That's an interesting reframing of the argument that I hadn't considered.
> I'd fallen into the idea of needing crc32c because of its bit error
> guarantees (all corruptions of odd numbers of bits and all corruptions of
> fewer than ...4? bits) that I hadn't quite realized that even if crc16
> can't guarantee to find any corruption at all, it still _might_, and that's
> better than nothing.
> 
> Ok, let's split the 32-bit fields and use crc16 for the case of 32-byte block
> group descriptors.

I noticed the crc16 calculation is actually _slower_ than crc32c,
probably because the CPU cannot use 32-bit values when computing the
result, so it has to do a lot of word masking, per your table at
https://ext4.wiki.kernel.org/index.php/Ext4_Metadata_Checksums.
Also, there is the question of whether computing two different
checksums is needlessly complicating the code, or if it is easier
to just compute crc32c all the time and only make the storing of
the high 16 bits conditional.

What I'm suggesting is always computing the crc32c, but for filesystems
that are not formatted with the 64bit option just store the low 16 bits
of the crc32c value into bg_{block,inode}_bitmap_csum_lo.  This is much
better than not computing a checksum here at all.  The only open question
is whether 1/2 of crc32c is substantially worse at detecting errors than
crc16 or not?


I was also thinking whether the EXT4_FEATURE_RO_COMPAT_METADATA_CSUM
feature should also cause the bg_checksum to do the same (store only
low 16 bits of crc32c) just for the improved speed?

It might be interesting to redo the table that you computed, but
using a loop that is only computing the checksums for small blocks
of data (e.g. 32 bytes and 4096 bytes in a loop for a total of 512MB)
to see what the overhead of the cryptoapi and hardware calls are.

>> struct ext4_group_desc
>> {
>>        __le32 bg_block_bitmap_lo;	/* Blocks bitmap block */
>>        __le32 bg_inode_bitmap_lo;	/* Inodes bitmap block */
>>        __le32 bg_inode_table_lo;	/* Inodes table block */
>>        __le16 bg_free_blocks_count_lo;	/* Free blocks count */
>>        __le16 bg_free_inodes_count_lo;	/* Free inodes count */
>>        __le16 bg_used_dirs_count_lo;	/* Directories count */
>>        __le16 bg_flags;		/* EXT4_BG_flags (INODE_UNINIT, etc) */
>>        __le32 bg_exclude_bitmap_lo;	/* Exclude bitmap block */
>>        __le16 bg_block_bitmap_csum_lo;	/* Block bitmap checksum */
>> 	__le16 bg_inode_bitmap_csum_lo;	/* Inode bitmap checksum */
>>        __le16 bg_itable_unused_lo;	/* Unused inodes count */
>>        __le16 bg_checksum;		/* crc16(sb_uuid+group+desc) */
>>        __le32 bg_block_bitmap_hi;	/* Blocks bitmap block MSB */
>>        __le32 bg_inode_bitmap_hi;	/* Inodes bitmap block MSB */
>>        __le32 bg_inode_table_hi;	/* Inodes table block MSB */
>>        __le16 bg_free_blocks_count_hi;	/* Free blocks count MSB */
>>        __le16 bg_free_inodes_count_hi;	/* Free inodes count MSB */
>>        __le16 bg_used_dirs_count_hi;	/* Directories count MSB */
>>        __le16 bg_itable_unused_hi;	/* Unused inodes count MSB */
>> 	__le32 bg_exclude_bitmap_hi;	/* Exclude bitmap block MSB */
>> 	__le16 bg_block_bitmap_csum_hi;	/* Blocks bitmap checksum MSB */
>> 	__le16 bg_inode_bitmap_csum_hi;	/* Inodes bitmap checksum MSB */
>>        __le32 bg_reserved2;
>> };
>> 
>> This is also different from your layout because it locates the block bitmap
>> checksum field before the inode bitmap checksum, to more closely match the
>> order of other fields in this structure.
> 
> Er.. I reversed the order in the structure definition just prior to publishing,
> and forgot to update the wiki page.  Well I guess I'm about to update it again.
> :)
> 
>>> /*
>>> diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
>>> index 9c63f27..53faffc 100644
>>> --- a/fs/ext4/ialloc.c
>>> +++ b/fs/ext4/ialloc.c
>>> @@ -82,12 +82,18 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
>>> 		ext4_free_inodes_set(sb, gdp, 0);
>>> 		ext4_itable_unused_set(sb, gdp, 0);
>>> 		memset(bh->b_data, 0xff, sb->s_blocksize);
>>> +		ext4_bitmap_csum_set(sb, block_group,
>>> +				     &gdp->bg_inode_bitmap_csum, bh,
>>> +				     (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
>> 
>> The number of inodes per group is already always a multiple of 8.
> 
> Ok.  I suppose we can fix that in the lines below too.
> 
>>> 		return 0;
>>> 	}
>>> 
>>> 	memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
>>> 	ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
>>> 			bh->b_data);
>>> +	ext4_bitmap_csum_set(sb, block_group, &gdp->bg_inode_bitmap_csum, bh,
>>> +			     (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
>>> +	gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
>>> 
>>> 	return EXT4_INODES_PER_GROUP(sb);
>>> }
>>> @@ -118,12 +124,12 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
>>> 		return NULL;
>>> 	}
>>> 	if (bitmap_uptodate(bh))
>>> -		return bh;
>>> +		goto verify;
>>> 
>>> 	lock_buffer(bh);
>>> 	if (bitmap_uptodate(bh)) {
>>> 		unlock_buffer(bh);
>>> -		return bh;
>>> +		goto verify;
>>> 	}
>>> 
>>> 	ext4_lock_group(sb, block_group);
>>> @@ -131,6 +137,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
>>> 		ext4_init_inode_bitmap(sb, bh, block_group, desc);
>>> 		set_bitmap_uptodate(bh);
>>> 		set_buffer_uptodate(bh);
>>> +		set_buffer_verified(bh);
>>> 		ext4_unlock_group(sb, block_group);
>>> 		unlock_buffer(bh);
>>> 		return bh;
>>> @@ -144,7 +151,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
>>> 		 */
>>> 		set_bitmap_uptodate(bh);
>>> 		unlock_buffer(bh);
>>> -		return bh;
>>> +		goto verify;
>>> 	}
>>> 	/*
>>> 	 * submit the buffer_head for read. We can
>>> @@ -161,6 +168,21 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
>>> 			    block_group, bitmap_blk);
>>> 		return NULL;
>>> 	}
>>> +
>>> +verify:
>>> +	ext4_lock_group(sb, block_group);
>>> +	if (!buffer_verified(bh) &&
>>> +	    !ext4_bitmap_csum_verify(sb, block_group,
>>> +				     desc->bg_inode_bitmap_csum, bh,
>>> +				     (EXT4_INODES_PER_GROUP(sb) + 7) / 8)) {
>>> +		ext4_unlock_group(sb, block_group);
>>> +		put_bh(bh);
>>> +		ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
>>> +			   "inode_bitmap = %llu", block_group, bitmap_blk);
>>> +		return NULL;
>> 
>> At some point we should add a flag like EXT4_BG_INODE_ERROR so that the
>> group can be marked in error on disk, and skipped for future allocations,
>> but the whole filesystem does not need to be remounted read-only.  That's
>> for another patch, however.
> 
> Agreed. :)
> 
> --D
> 
>>> +	}
>>> +	ext4_unlock_group(sb, block_group);
>>> +	set_buffer_verified(bh);
>>> 	return bh;
>>> }
>>> 
>>> @@ -265,6 +287,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
>>> 		ext4_used_dirs_set(sb, gdp, count);
>>> 		percpu_counter_dec(&sbi->s_dirs_counter);
>>> 	}
>>> +	ext4_bitmap_csum_set(sb, block_group, &gdp->bg_inode_bitmap_csum,
>>> +			     bitmap_bh, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
>>> 	gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
>>> 	ext4_unlock_group(sb, block_group);
>>> 
>>> @@ -784,6 +808,9 @@ static int ext4_claim_inode(struct super_block *sb,
>>> 			atomic_inc(&sbi->s_flex_groups[f].used_dirs);
>>> 		}
>>> 	}
>>> +	ext4_bitmap_csum_set(sb, group, &gdp->bg_inode_bitmap_csum,
>>> +			     inode_bitmap_bh,
>>> +			     (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
>>> 	gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
>>> err_ret:
>>> 	ext4_unlock_group(sb, group);
>>> 
>> 
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong Sept. 5, 2011, 6:22 p.m. UTC | #4
On Fri, Sep 02, 2011 at 03:27:21PM -0600, Andreas Dilger wrote:
> On 2011-09-02, at 1:18 PM, Darrick J. Wong wrote:
> > On Wed, Aug 31, 2011 at 10:49:05PM -0600, Andreas Dilger wrote:
> >> On 2011-08-31, at 6:31 PM, Darrick J. Wong wrote:
> >>> Compute and verify the checksum of the inode bitmap; the checkum is stored in
> >>> the block group descriptor.
> >> 
> >> I would prefer if there was a 16-bit checksum for the (most common)
> >> 32-byte group descriptors, and this was extended to a 32-bit checksum
> >> for the (much less common) 64-byte+ group descriptors.  For filesystems
> >> that are newly formatted with the 64bit feature it makes no difference,
> >> but virtually all ext3/4 filesystems have only the smaller group descriptors.
> >> 
> >> Regardless of whether using half of the crc32c is better or worse than
> >> using crc16 for the bitmap blocks, storing _any_ checksum is better than
> >> storing nothing at all.  I would propose the following:
> > 
> > That's an interesting reframing of the argument that I hadn't considered.
> > I'd fallen into the idea of needing crc32c because of its bit error
> > guarantees (all corruptions of odd numbers of bits and all corruptions of
> > fewer than ...4? bits) that I hadn't quite realized that even if crc16
> > can't guarantee to find any corruption at all, it still _might_, and that's
> > better than nothing.
> > 
> > Ok, let's split the 32-bit fields and use crc16 for the case of 32-byte block
> > group descriptors.
> 
> I noticed the crc16 calculation is actually _slower_ than crc32c,
> probably because the CPU cannot use 32-bit values when computing the
> result, so it has to do a lot of word masking, per your table at
> https://ext4.wiki.kernel.org/index.php/Ext4_Metadata_Checksums.
> Also, there is the question of whether computing two different
> checksums is needlessly complicating the code, or if it is easier
> to just compute crc32c all the time and only make the storing of
> the high 16 bits conditional.
> 
> What I'm suggesting is always computing the crc32c, but for filesystems
> that are not formatted with the 64bit option just store the low 16 bits
> of the crc32c value into bg_{block,inode}_bitmap_csum_lo.  This is much
> better than not computing a checksum here at all.  The only open question
> is whether 1/2 of crc32c is substantially worse at detecting errors than
> crc16 or not?

All the literature I've read has suggested that crc16 can't guarantee any error
detection capability at all with data buffers longer than 256 bytes.  So far in
my simulations I haven't seen that truncated-crc32c is particularly worse than
crc16, though they both seem to miss a lot of errors that crc32c would catch.
I guess we might as well use the fast one, it'll at least make the code
cleaner.

> I was also thinking whether the EXT4_FEATURE_RO_COMPAT_METADATA_CSUM
> feature should also cause the bg_checksum to do the same (store only
> low 16 bits of crc32c) just for the improved speed?

That mostly depends on how much overhead cryptoapi has over crc16 for small
blob sizes.

Or, if we imagine that bg descriptors might some day grow beyond 256 bytes,
maybe we allocate an extra 16 bits to store the entire crc32c?  This isn't as
clear-cut as inodes where a user can specify a huge size at mkfs time.

Perhaps we ought to declare a "CRC type" field in the sb (where 0 = crc16 and 1
= crc32c) just in case there's ever a desire to change the checksum algorithm?

...or, for the bg tables, we could (ab)use the structure definition a bit.
crc32c the entire block, and store the low/high 16-bits of the checksum in the
first and second descriptors' checksum fields.  This of course would result in
a loss of granularity (128 bgs go bad at once instead of just 1), so I don't
know if it's really needed for a small structure that can quadruple in size
before we need a longer CRC (and hasn't grown much in 15 years).

> It might be interesting to redo the table that you computed, but
> using a loop that is only computing the checksums for small blocks
> of data (e.g. 32 bytes and 4096 bytes in a loop for a total of 512MB)
> to see what the overhead of the cryptoapi and hardware calls are.

Yep.

--D

> >> struct ext4_group_desc
> >> {
> >>        __le32 bg_block_bitmap_lo;	/* Blocks bitmap block */
> >>        __le32 bg_inode_bitmap_lo;	/* Inodes bitmap block */
> >>        __le32 bg_inode_table_lo;	/* Inodes table block */
> >>        __le16 bg_free_blocks_count_lo;	/* Free blocks count */
> >>        __le16 bg_free_inodes_count_lo;	/* Free inodes count */
> >>        __le16 bg_used_dirs_count_lo;	/* Directories count */
> >>        __le16 bg_flags;		/* EXT4_BG_flags (INODE_UNINIT, etc) */
> >>        __le32 bg_exclude_bitmap_lo;	/* Exclude bitmap block */
> >>        __le16 bg_block_bitmap_csum_lo;	/* Block bitmap checksum */
> >> 	__le16 bg_inode_bitmap_csum_lo;	/* Inode bitmap checksum */
> >>        __le16 bg_itable_unused_lo;	/* Unused inodes count */
> >>        __le16 bg_checksum;		/* crc16(sb_uuid+group+desc) */
> >>        __le32 bg_block_bitmap_hi;	/* Blocks bitmap block MSB */
> >>        __le32 bg_inode_bitmap_hi;	/* Inodes bitmap block MSB */
> >>        __le32 bg_inode_table_hi;	/* Inodes table block MSB */
> >>        __le16 bg_free_blocks_count_hi;	/* Free blocks count MSB */
> >>        __le16 bg_free_inodes_count_hi;	/* Free inodes count MSB */
> >>        __le16 bg_used_dirs_count_hi;	/* Directories count MSB */
> >>        __le16 bg_itable_unused_hi;	/* Unused inodes count MSB */
> >> 	__le32 bg_exclude_bitmap_hi;	/* Exclude bitmap block MSB */
> >> 	__le16 bg_block_bitmap_csum_hi;	/* Blocks bitmap checksum MSB */
> >> 	__le16 bg_inode_bitmap_csum_hi;	/* Inodes bitmap checksum MSB */
> >>        __le32 bg_reserved2;
> >> };
> >> 
> >> This is also different from your layout because it locates the block bitmap
> >> checksum field before the inode bitmap checksum, to more closely match the
> >> order of other fields in this structure.
> > 
> > Er.. I reversed the order in the structure definition just prior to publishing,
> > and forgot to update the wiki page.  Well I guess I'm about to update it again.
> > :)
> > 
> >>> /*
> >>> diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
> >>> index 9c63f27..53faffc 100644
> >>> --- a/fs/ext4/ialloc.c
> >>> +++ b/fs/ext4/ialloc.c
> >>> @@ -82,12 +82,18 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
> >>> 		ext4_free_inodes_set(sb, gdp, 0);
> >>> 		ext4_itable_unused_set(sb, gdp, 0);
> >>> 		memset(bh->b_data, 0xff, sb->s_blocksize);
> >>> +		ext4_bitmap_csum_set(sb, block_group,
> >>> +				     &gdp->bg_inode_bitmap_csum, bh,
> >>> +				     (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
> >> 
> >> The number of inodes per group is already always a multiple of 8.
> > 
> > Ok.  I suppose we can fix that in the lines below too.
> > 
> >>> 		return 0;
> >>> 	}
> >>> 
> >>> 	memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
> >>> 	ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
> >>> 			bh->b_data);
> >>> +	ext4_bitmap_csum_set(sb, block_group, &gdp->bg_inode_bitmap_csum, bh,
> >>> +			     (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
> >>> +	gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
> >>> 
> >>> 	return EXT4_INODES_PER_GROUP(sb);
> >>> }
> >>> @@ -118,12 +124,12 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
> >>> 		return NULL;
> >>> 	}
> >>> 	if (bitmap_uptodate(bh))
> >>> -		return bh;
> >>> +		goto verify;
> >>> 
> >>> 	lock_buffer(bh);
> >>> 	if (bitmap_uptodate(bh)) {
> >>> 		unlock_buffer(bh);
> >>> -		return bh;
> >>> +		goto verify;
> >>> 	}
> >>> 
> >>> 	ext4_lock_group(sb, block_group);
> >>> @@ -131,6 +137,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
> >>> 		ext4_init_inode_bitmap(sb, bh, block_group, desc);
> >>> 		set_bitmap_uptodate(bh);
> >>> 		set_buffer_uptodate(bh);
> >>> +		set_buffer_verified(bh);
> >>> 		ext4_unlock_group(sb, block_group);
> >>> 		unlock_buffer(bh);
> >>> 		return bh;
> >>> @@ -144,7 +151,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
> >>> 		 */
> >>> 		set_bitmap_uptodate(bh);
> >>> 		unlock_buffer(bh);
> >>> -		return bh;
> >>> +		goto verify;
> >>> 	}
> >>> 	/*
> >>> 	 * submit the buffer_head for read. We can
> >>> @@ -161,6 +168,21 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
> >>> 			    block_group, bitmap_blk);
> >>> 		return NULL;
> >>> 	}
> >>> +
> >>> +verify:
> >>> +	ext4_lock_group(sb, block_group);
> >>> +	if (!buffer_verified(bh) &&
> >>> +	    !ext4_bitmap_csum_verify(sb, block_group,
> >>> +				     desc->bg_inode_bitmap_csum, bh,
> >>> +				     (EXT4_INODES_PER_GROUP(sb) + 7) / 8)) {
> >>> +		ext4_unlock_group(sb, block_group);
> >>> +		put_bh(bh);
> >>> +		ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
> >>> +			   "inode_bitmap = %llu", block_group, bitmap_blk);
> >>> +		return NULL;
> >> 
> >> At some point we should add a flag like EXT4_BG_INODE_ERROR so that the
> >> group can be marked in error on disk, and skipped for future allocations,
> >> but the whole filesystem does not need to be remounted read-only.  That's
> >> for another patch, however.
> > 
> > Agreed. :)
> > 
> > --D
> > 
> >>> +	}
> >>> +	ext4_unlock_group(sb, block_group);
> >>> +	set_buffer_verified(bh);
> >>> 	return bh;
> >>> }
> >>> 
> >>> @@ -265,6 +287,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
> >>> 		ext4_used_dirs_set(sb, gdp, count);
> >>> 		percpu_counter_dec(&sbi->s_dirs_counter);
> >>> 	}
> >>> +	ext4_bitmap_csum_set(sb, block_group, &gdp->bg_inode_bitmap_csum,
> >>> +			     bitmap_bh, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
> >>> 	gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
> >>> 	ext4_unlock_group(sb, block_group);
> >>> 
> >>> @@ -784,6 +808,9 @@ static int ext4_claim_inode(struct super_block *sb,
> >>> 			atomic_inc(&sbi->s_flex_groups[f].used_dirs);
> >>> 		}
> >>> 	}
> >>> +	ext4_bitmap_csum_set(sb, group, &gdp->bg_inode_bitmap_csum,
> >>> +			     inode_bitmap_bh,
> >>> +			     (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
> >>> 	gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
> >>> err_ret:
> >>> 	ext4_unlock_group(sb, group);
> >>> 
> >> 
> >> --
> >> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> >> the body of a message to majordomo@vger.kernel.org
> >> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andi Kleen Sept. 5, 2011, 6:27 p.m. UTC | #5
One very interesting optimization would be to profile the metadata workload
where you got the slowdown and try to figure out which fields
were most commonly updated.

If it's some superblock fields or similar maybe those could get their separate
crcs that could be separatedly computed at much lower cost.

-Andi
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley Sept. 5, 2011, 7:45 p.m. UTC | #6
On Mon, 2011-09-05 at 11:22 -0700, Darrick J. Wong wrote:
> On Fri, Sep 02, 2011 at 03:27:21PM -0600, Andreas Dilger wrote:
> > On 2011-09-02, at 1:18 PM, Darrick J. Wong wrote:
> > > On Wed, Aug 31, 2011 at 10:49:05PM -0600, Andreas Dilger wrote:
> > >> On 2011-08-31, at 6:31 PM, Darrick J. Wong wrote:
> > >>> Compute and verify the checksum of the inode bitmap; the checkum is stored in
> > >>> the block group descriptor.
> > >> 
> > >> I would prefer if there was a 16-bit checksum for the (most common)
> > >> 32-byte group descriptors, and this was extended to a 32-bit checksum
> > >> for the (much less common) 64-byte+ group descriptors.  For filesystems
> > >> that are newly formatted with the 64bit feature it makes no difference,
> > >> but virtually all ext3/4 filesystems have only the smaller group descriptors.
> > >> 
> > >> Regardless of whether using half of the crc32c is better or worse than
> > >> using crc16 for the bitmap blocks, storing _any_ checksum is better than
> > >> storing nothing at all.  I would propose the following:
> > > 
> > > That's an interesting reframing of the argument that I hadn't considered.
> > > I'd fallen into the idea of needing crc32c because of its bit error
> > > guarantees (all corruptions of odd numbers of bits and all corruptions of
> > > fewer than ...4? bits) that I hadn't quite realized that even if crc16
> > > can't guarantee to find any corruption at all, it still _might_, and that's
> > > better than nothing.
> > > 
> > > Ok, let's split the 32-bit fields and use crc16 for the case of 32-byte block
> > > group descriptors.
> > 
> > I noticed the crc16 calculation is actually _slower_ than crc32c,
> > probably because the CPU cannot use 32-bit values when computing the
> > result, so it has to do a lot of word masking, per your table at
> > https://ext4.wiki.kernel.org/index.php/Ext4_Metadata_Checksums.
> > Also, there is the question of whether computing two different
> > checksums is needlessly complicating the code, or if it is easier
> > to just compute crc32c all the time and only make the storing of
> > the high 16 bits conditional.
> > 
> > What I'm suggesting is always computing the crc32c, but for filesystems
> > that are not formatted with the 64bit option just store the low 16 bits
> > of the crc32c value into bg_{block,inode}_bitmap_csum_lo.  This is much
> > better than not computing a checksum here at all.  The only open question
> > is whether 1/2 of crc32c is substantially worse at detecting errors than
> > crc16 or not?
> 
> All the literature I've read has suggested that crc16 can't guarantee any error
> detection capability at all with data buffers longer than 256 bytes.

Um, so in a hashing algorithm that maps f:Z_m -> Z_n you can never
guarantee error detection if m>n because of hash collisions.  All you
can guarantee is that if f(a) != f(b) then a != b, so crc16 wouldn't be
able to *guarantee* error detection in anything over 2 bytes.

All of the rest of the magic in hashing functions goes into making sure
that the collision sets don't include common errors (like bit flipping).
In theory, for the correct polynomial, CRC-16 should be able to detect
single, double and triple bit flip errors in blocks of up to 8191
bytes ... of course, if those aren't your common errors, then this
analysis is useless ...

James


--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong Sept. 5, 2011, 10:12 p.m. UTC | #7
On Mon, Sep 05, 2011 at 02:45:28PM -0500, James Bottomley wrote:
> On Mon, 2011-09-05 at 11:22 -0700, Darrick J. Wong wrote:
> > On Fri, Sep 02, 2011 at 03:27:21PM -0600, Andreas Dilger wrote:
> > > On 2011-09-02, at 1:18 PM, Darrick J. Wong wrote:
> > > > On Wed, Aug 31, 2011 at 10:49:05PM -0600, Andreas Dilger wrote:
> > > >> On 2011-08-31, at 6:31 PM, Darrick J. Wong wrote:
> > > >>> Compute and verify the checksum of the inode bitmap; the checkum is stored in
> > > >>> the block group descriptor.
> > > >> 
> > > >> I would prefer if there was a 16-bit checksum for the (most common)
> > > >> 32-byte group descriptors, and this was extended to a 32-bit checksum
> > > >> for the (much less common) 64-byte+ group descriptors.  For filesystems
> > > >> that are newly formatted with the 64bit feature it makes no difference,
> > > >> but virtually all ext3/4 filesystems have only the smaller group descriptors.
> > > >> 
> > > >> Regardless of whether using half of the crc32c is better or worse than
> > > >> using crc16 for the bitmap blocks, storing _any_ checksum is better than
> > > >> storing nothing at all.  I would propose the following:
> > > > 
> > > > That's an interesting reframing of the argument that I hadn't considered.
> > > > I'd fallen into the idea of needing crc32c because of its bit error
> > > > guarantees (all corruptions of odd numbers of bits and all corruptions of
> > > > fewer than ...4? bits) that I hadn't quite realized that even if crc16
> > > > can't guarantee to find any corruption at all, it still _might_, and that's
> > > > better than nothing.
> > > > 
> > > > Ok, let's split the 32-bit fields and use crc16 for the case of 32-byte block
> > > > group descriptors.
> > > 
> > > I noticed the crc16 calculation is actually _slower_ than crc32c,
> > > probably because the CPU cannot use 32-bit values when computing the
> > > result, so it has to do a lot of word masking, per your table at
> > > https://ext4.wiki.kernel.org/index.php/Ext4_Metadata_Checksums.
> > > Also, there is the question of whether computing two different
> > > checksums is needlessly complicating the code, or if it is easier
> > > to just compute crc32c all the time and only make the storing of
> > > the high 16 bits conditional.
> > > 
> > > What I'm suggesting is always computing the crc32c, but for filesystems
> > > that are not formatted with the 64bit option just store the low 16 bits
> > > of the crc32c value into bg_{block,inode}_bitmap_csum_lo.  This is much
> > > better than not computing a checksum here at all.  The only open question
> > > is whether 1/2 of crc32c is substantially worse at detecting errors than
> > > crc16 or not?
> > 
> > All the literature I've read has suggested that crc16 can't guarantee any error
> > detection capability at all with data buffers longer than 256 bytes.
> 
> Um, so in a hashing algorithm that maps f:Z_m -> Z_n you can never
> guarantee error detection if m>n because of hash collisions.  All you
> can guarantee is that if f(a) != f(b) then a != b, so crc16 wouldn't be
> able to *guarantee* error detection in anything over 2 bytes.
> 
> All of the rest of the magic in hashing functions goes into making sure
> that the collision sets don't include common errors (like bit flipping).
> In theory, for the correct polynomial, CRC-16 should be able to detect
> single, double and triple bit flip errors in blocks of up to 8191
> bytes ... of course, if those aren't your common errors, then this
> analysis is useless ...

Sorry, I grossly misspoke.  Of course crc16 can't guarantee the ability to
detect all possible errors in any data block larger than 16 bits.  What I meant
to say is that I wasn't sure what is the maximum number of bit errors that
crc16 polynomials can detect given a message length of 32768+32+128 bits.

In particular, I remember reading on the wikipedia page[1] that for polynomials
with odd numbers of terms (such as ansi crc16), the period for 2-bit errors is
65535 bits as you say; but as I recall, those polynomials also can't detect all
errors involving odd numbers of bit flips.  For polynomials with even numbers
of terms (such as the t10dif one) the period in which it can detect 2-bit
errors is 32767 bits, but on the other hand they can detect odd numbers of
errors.

The people who study error rates on disk hardware at IBM tell me that bit flips
are more common than you'd like, though I was also looking for something that
can tell me if blocks are being written to the wrong places.

[1] http://en.wikipedia.org/wiki/Mathematics_of_CRC#Bitfilters

--D
> 
> James
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bc7ace1..248cbd2 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -279,7 +279,8 @@  struct ext4_group_desc
 	__le16	bg_free_inodes_count_hi;/* Free inodes count MSB */
 	__le16	bg_used_dirs_count_hi;	/* Directories count MSB */
 	__le16  bg_itable_unused_hi;    /* Unused inodes count MSB */
-	__u32	bg_reserved2[3];
+	__le32	bg_inode_bitmap_csum;	/* crc32c(uuid+group+ibitmap) */
+	__u32	bg_reserved2[2];
 };
 
 /*
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 9c63f27..53faffc 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -82,12 +82,18 @@  static unsigned ext4_init_inode_bitmap(struct super_block *sb,
 		ext4_free_inodes_set(sb, gdp, 0);
 		ext4_itable_unused_set(sb, gdp, 0);
 		memset(bh->b_data, 0xff, sb->s_blocksize);
+		ext4_bitmap_csum_set(sb, block_group,
+				     &gdp->bg_inode_bitmap_csum, bh,
+				     (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
 		return 0;
 	}
 
 	memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
 	ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
 			bh->b_data);
+	ext4_bitmap_csum_set(sb, block_group, &gdp->bg_inode_bitmap_csum, bh,
+			     (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
+	gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
 
 	return EXT4_INODES_PER_GROUP(sb);
 }
@@ -118,12 +124,12 @@  ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 		return NULL;
 	}
 	if (bitmap_uptodate(bh))
-		return bh;
+		goto verify;
 
 	lock_buffer(bh);
 	if (bitmap_uptodate(bh)) {
 		unlock_buffer(bh);
-		return bh;
+		goto verify;
 	}
 
 	ext4_lock_group(sb, block_group);
@@ -131,6 +137,7 @@  ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 		ext4_init_inode_bitmap(sb, bh, block_group, desc);
 		set_bitmap_uptodate(bh);
 		set_buffer_uptodate(bh);
+		set_buffer_verified(bh);
 		ext4_unlock_group(sb, block_group);
 		unlock_buffer(bh);
 		return bh;
@@ -144,7 +151,7 @@  ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 		 */
 		set_bitmap_uptodate(bh);
 		unlock_buffer(bh);
-		return bh;
+		goto verify;
 	}
 	/*
 	 * submit the buffer_head for read. We can
@@ -161,6 +168,21 @@  ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 			    block_group, bitmap_blk);
 		return NULL;
 	}
+
+verify:
+	ext4_lock_group(sb, block_group);
+	if (!buffer_verified(bh) &&
+	    !ext4_bitmap_csum_verify(sb, block_group,
+				     desc->bg_inode_bitmap_csum, bh,
+				     (EXT4_INODES_PER_GROUP(sb) + 7) / 8)) {
+		ext4_unlock_group(sb, block_group);
+		put_bh(bh);
+		ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
+			   "inode_bitmap = %llu", block_group, bitmap_blk);
+		return NULL;
+	}
+	ext4_unlock_group(sb, block_group);
+	set_buffer_verified(bh);
 	return bh;
 }
 
@@ -265,6 +287,8 @@  void ext4_free_inode(handle_t *handle, struct inode *inode)
 		ext4_used_dirs_set(sb, gdp, count);
 		percpu_counter_dec(&sbi->s_dirs_counter);
 	}
+	ext4_bitmap_csum_set(sb, block_group, &gdp->bg_inode_bitmap_csum,
+			     bitmap_bh, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
 	ext4_unlock_group(sb, block_group);
 
@@ -784,6 +808,9 @@  static int ext4_claim_inode(struct super_block *sb,
 			atomic_inc(&sbi->s_flex_groups[f].used_dirs);
 		}
 	}
+	ext4_bitmap_csum_set(sb, group, &gdp->bg_inode_bitmap_csum,
+			     inode_bitmap_bh,
+			     (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
 err_ret:
 	ext4_unlock_group(sb, group);