Patchwork [10/37] mke2fs: Allow metadata checksums to be turned on at mkfs time

login
register
mail settings
Submitter Darrick J. Wong
Date Sept. 1, 2011, 12:36 a.m.
Message ID <20110901003615.1176.76957.stgit@elm3c44.beaverton.ibm.com>
Download mbox | patch
Permalink /patch/112759/
State Changes Requested
Headers show

Comments

Darrick J. Wong - Sept. 1, 2011, 12:36 a.m.
Write out checksummed inodes even when writing out a zeroed table.

Signed-off-by: Darrick J. Wong <djwong@us.ibm.com>
---
 misc/mke2fs.c |   37 ++++++++++++++++++++++++++++++-------
 1 files changed, 30 insertions(+), 7 deletions(-)



--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andreas Dilger - Sept. 4, 2011, 6:28 p.m.
On 2011-08-31, at 6:36 PM, "Darrick J. Wong" <djwong@us.ibm.com> wrote:
> Write out checksummed inodes even when writing out a zeroed table.
> 
> Signed-off-by: Darrick J. Wong <djwong@us.ibm.com>
> ---
> misc/mke2fs.c |   37 ++++++++++++++++++++++++++++++-------
> 1 files changed, 30 insertions(+), 7 deletions(-)
> 
> 
> diff --git a/misc/mke2fs.c b/misc/mke2fs.c
> index 2d57d09..bbc0533 100644
> --- a/misc/mke2fs.c
> +++ b/misc/mke2fs.c
> @@ -309,6 +309,8 @@ static void write_inode_tables(ext2_filsys fs, int lazy_flag, int itable_zeroed)
>    dgrp_t        i;
>    int        num;
>    struct ext2fs_numeric_progress_struct progress;
> +    ext2_ino_t    ino;
> +    struct ext2_inode_large inode;
> 
>    ext2fs_numeric_progress_init(fs, &progress,
>                     _("Writing inode tables: "),
> @@ -330,12 +332,32 @@ static void write_inode_tables(ext2_filsys fs, int lazy_flag, int itable_zeroed)
>            ext2fs_bg_flags_set(fs, i, EXT2_BG_INODE_ZEROED);
>            ext2fs_group_desc_csum_set(fs, i);
>        }
> -        retval = ext2fs_zero_blocks2(fs, blk, num, &blk, &num);
> -        if (retval) {
> -            fprintf(stderr, _("\nCould not write %d "
> -                  "blocks in inode table starting at %llu: %s\n"),
> -                num, blk, error_message(retval));
> -            exit(1);
> +        if (fs->super->s_creator_os == EXT2_OS_LINUX &&
> +            fs->super->s_feature_ro_compat &
> +            EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) {

Somehow it doesn't look like this skips the zeroing of the inode table blocks when lazy itable zeroing is set.

Any measurements on how much this slows down inode table writing (which is already the slowest part of mke2fs)?

> +            bzero(&inode, sizeof(inode));
> +            for (ino = fs->super->s_inodes_per_group * i;
> +                 ino < fs->super->s_inodes_per_group * (i + 1);
> +                 ino++) {

Why recompute "ino" each time through this loop?  It should be enough to simply initialize it at 1 and then increment it for each inode written. 

> +                if (!ino)
> +                    continue;
> +                retval = ext2fs_write_inode(fs, ino, &inode);
> +                if (retval) {
> +                    com_err("inode_init", retval,
> +                        "while writing inode %d\n",
> +                        ino);
> +                    exit(1);
> +                }
> +            }
> +        } else {
> +            retval = ext2fs_zero_blocks2(fs, blk, num, &blk, &num);
> +            if (retval) {
> +                fprintf(stderr, _("\nCould not write %d "
> +                    "blocks in inode table starting "
> +                    "at %llu: %s\n"),
> +                    num, blk, error_message(retval));
> +                exit(1);
> +            }
>        }
>        if (sync_kludge) {
>            if (sync_kludge == 1)
> @@ -829,7 +851,8 @@ static __u32 ok_features[3] = {
>        EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE|
>        EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER|
>        EXT4_FEATURE_RO_COMPAT_GDT_CSUM|
> -        EXT4_FEATURE_RO_COMPAT_BIGALLOC
> +        EXT4_FEATURE_RO_COMPAT_BIGALLOC|
> +        EXT4_FEATURE_RO_COMPAT_METADATA_CSUM
> };
> 
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong - Sept. 5, 2011, 7:20 p.m.
On Sun, Sep 04, 2011 at 12:28:24PM -0600, Andreas Dilger wrote:
> On 2011-08-31, at 6:36 PM, "Darrick J. Wong" <djwong@us.ibm.com> wrote:
> > Write out checksummed inodes even when writing out a zeroed table.
> > 
> > Signed-off-by: Darrick J. Wong <djwong@us.ibm.com>
> > ---
> > misc/mke2fs.c |   37 ++++++++++++++++++++++++++++++-------
> > 1 files changed, 30 insertions(+), 7 deletions(-)
> > 
> > 
> > diff --git a/misc/mke2fs.c b/misc/mke2fs.c
> > index 2d57d09..bbc0533 100644
> > --- a/misc/mke2fs.c
> > +++ b/misc/mke2fs.c
> > @@ -309,6 +309,8 @@ static void write_inode_tables(ext2_filsys fs, int lazy_flag, int itable_zeroed)
> >    dgrp_t        i;
> >    int        num;
> >    struct ext2fs_numeric_progress_struct progress;
> > +    ext2_ino_t    ino;
> > +    struct ext2_inode_large inode;
> > 
> >    ext2fs_numeric_progress_init(fs, &progress,
> >                     _("Writing inode tables: "),
> > @@ -330,12 +332,32 @@ static void write_inode_tables(ext2_filsys fs, int lazy_flag, int itable_zeroed)
> >            ext2fs_bg_flags_set(fs, i, EXT2_BG_INODE_ZEROED);
> >            ext2fs_group_desc_csum_set(fs, i);
> >        }
> > -        retval = ext2fs_zero_blocks2(fs, blk, num, &blk, &num);
> > -        if (retval) {
> > -            fprintf(stderr, _("\nCould not write %d "
> > -                  "blocks in inode table starting at %llu: %s\n"),
> > -                num, blk, error_message(retval));
> > -            exit(1);
> > +        if (fs->super->s_creator_os == EXT2_OS_LINUX &&
> > +            fs->super->s_feature_ro_compat &
> > +            EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) {
> 
> Somehow it doesn't look like this is skipping the zeroing of the inode table
> blocks if lazy itable zeroing is set. 
> 
> Any measurements on how much this slows down inode table writing (which is
> already the slowest part of mke2fs)?

Quite a lot, actually.  Trouble is, if you're going to write zeroes to the
inode table (without using uninit) then I think you need the checksums to
match.  Maybe the solution is to modify the kernel/e2fsck to ignore the
checksum if the inode bitmap says the inode isn't in use?

A better solution is to zero the buffer, stuff in all the checksums in the
correct places, and then write the block out.

> > +            bzero(&inode, sizeof(inode));
> > +            for (ino = fs->super->s_inodes_per_group * i;
> > +                 ino < fs->super->s_inodes_per_group * (i + 1);
> > +                 ino++) {
> 
> Why recompute "ino" each time through this loop?  It should be enough to
> simply initialize it at 1 and then increment it for each inode written. 

Agreed.

--D
> > +                if (!ino)
> > +                    continue;
> > +                retval = ext2fs_write_inode(fs, ino, &inode);
> > +                if (retval) {
> > +                    com_err("inode_init", retval,
> > +                        "while writing inode %d\n",
> > +                        ino);
> > +                    exit(1);
> > +                }
> > +            }
> > +        } else {
> > +            retval = ext2fs_zero_blocks2(fs, blk, num, &blk, &num);
> > +            if (retval) {
> > +                fprintf(stderr, _("\nCould not write %d "
> > +                    "blocks in inode table starting "
> > +                    "at %llu: %s\n"),
> > +                    num, blk, error_message(retval));
> > +                exit(1);
> > +            }
> >        }
> >        if (sync_kludge) {
> >            if (sync_kludge == 1)
> > @@ -829,7 +851,8 @@ static __u32 ok_features[3] = {
> >        EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE|
> >        EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER|
> >        EXT4_FEATURE_RO_COMPAT_GDT_CSUM|
> > -        EXT4_FEATURE_RO_COMPAT_BIGALLOC
> > +        EXT4_FEATURE_RO_COMPAT_BIGALLOC|
> > +        EXT4_FEATURE_RO_COMPAT_METADATA_CSUM
> > };
> > 
> > 
> > 
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andreas Dilger - Sept. 6, 2011, 1:54 a.m.
On 2011-09-05, at 1:20 PM, "Darrick J. Wong" <djwong@us.ibm.com> wrote:
> On Sun, Sep 04, 2011 at 12:28:24PM -0600, Andreas Dilger wrote:
>> On 2011-08-31, at 6:36 PM, "Darrick J. Wong" <djwong@us.ibm.com> wrote:
>>> Write out checksummed inodes even when writing out a zeroed table.
>>> 
>>> +        if (fs->super->s_creator_os == EXT2_OS_LINUX &&
>>> +            fs->super->s_feature_ro_compat &
>>> +            EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) {
>> 
>> Somehow it doesn't look like this is skipping the zeroing of the inode table
>> blocks if lazy itable zeroing is set. 
>> 
>> Any measurements on how much this slows down inode table writing (which is
>> already the slowest part of mke2fs)?
> 
> Quite a lot, actually.  Trouble is, if you're going to write zeroes to the
> inode table (without using uninit) then I think you need the checksums to
> match.  Maybe the solution is to modify the kernel/e2fsck to ignore the
> checksum if the inode bitmap says the inode isn't in use?

The kernel is already aware of which inodes are not in use if the uninit_bg feature is enabled. Even without uninit_bg, the kernel will not read itable blocks from disk if none of the inodes in that block are used. 

Also, if the lazy_itable_init option is passed to mke2fs, mke2fs isn't supposed to initialize the inode table at all, and the kernel should do it instead.

> A better solution is to zero the buffer, stuff in all the checksums in the
> correct places, and then write the block out.

Rather, the kernel should do it in the background. 

>>> +            bzero(&inode, sizeof(inode));
>>> +            for (ino = fs->super->s_inodes_per_group * i;
>>> +                 ino < fs->super->s_inodes_per_group * (i + 1);
>>> +                 ino++) {
>> 
>> Why recompute "ino" each time through this loop?  It should be enough to
>> simply initialize it at 1 and then increment it for each inode written. 
> 
> Agreed.
> 
> --D
>>> +                if (!ino)
>>> +                    continue;
>>> +                retval = ext2fs_write_inode(fs, ino, &inode);
>>> +                if (retval) {
>>> +                    com_err("inode_init", retval,
>>> +                        "while writing inode %d\n",
>>> +                        ino);
>>> +                    exit(1);
>>> +                }
>>> +            }
>>> +        } else {
>>> +            retval = ext2fs_zero_blocks2(fs, blk, num, &blk, &num);
>>> +            if (retval) {
>>> +                fprintf(stderr, _("\nCould not write %d "
>>> +                    "blocks in inode table starting "
>>> +                    "at %llu: %s\n"),
>>> +                    num, blk, error_message(retval));
>>> +                exit(1);
>>> +            }
>>>       }
>>>       if (sync_kludge) {
>>>           if (sync_kludge == 1)
>>> @@ -829,7 +851,8 @@ static __u32 ok_features[3] = {
>>>       EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE|
>>>       EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER|
>>>       EXT4_FEATURE_RO_COMPAT_GDT_CSUM|
>>> -        EXT4_FEATURE_RO_COMPAT_BIGALLOC
>>> +        EXT4_FEATURE_RO_COMPAT_BIGALLOC|
>>> +        EXT4_FEATURE_RO_COMPAT_METADATA_CSUM
>>> };
>>> 
>>> 
>>> 
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong - Sept. 6, 2011, 5:13 p.m.
On Mon, Sep 05, 2011 at 07:54:32PM -0600, Andreas Dilger wrote:
> On 2011-09-05, at 1:20 PM, "Darrick J. Wong" <djwong@us.ibm.com> wrote:
> > On Sun, Sep 04, 2011 at 12:28:24PM -0600, Andreas Dilger wrote:
> >> On 2011-08-31, at 6:36 PM, "Darrick J. Wong" <djwong@us.ibm.com> wrote:
> >>> Write out checksummed inodes even when writing out a zeroed table.
> >>> 
> >>> +        if (fs->super->s_creator_os == EXT2_OS_LINUX &&
> >>> +            fs->super->s_feature_ro_compat &
> >>> +            EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) {
> >> 
> >> Somehow it doesn't look like this is skipping the zeroing of the inode table
> >> blocks if lazy itable zeroing is set. 
> >> 
> >> Any measurements on how much this slows down inode table writing (which is
> >> already the slowest part of mke2fs)?
> > 
> > Quite a lot, actually.  Trouble is, if you're going to write zeroes to the
> > inode table (without using uninit) then I think you need the checksums to
> > match.  Maybe the solution is to modify the kernel/e2fsck to ignore the
> > checksum if the inode bitmap says the inode isn't in use?
> 
> The kernel is already aware of which inodes are not in use if the uninit_bg
> feature is enabled. Even without uninit_bg, the kernel will not read itable
> blocks from disk if none of the inodes in that block are used. 
> 
> Also, if the lazy_itable_init is passed to mke2fs it isn't supposed to
> initialize the inode table at all, and the kernel should do it instead. 

Ok.

> > A better solution is to zero the buffer, stuff in all the checksums in the
> > correct places, and then write the block out.
> 
> Rather, the kernel should do it in the background. 

Append "...If the kernel won't do it in the background." to my earlier
statement. :) There seems to be some code that probes around in sysfs to make
sure that the kernel can handle uninit_bg ...
/sys/fs/ext4/features/lazy_itable_init I think?

--D

> >>> +            bzero(&inode, sizeof(inode));
> >>> +            for (ino = fs->super->s_inodes_per_group * i;
> >>> +                 ino < fs->super->s_inodes_per_group * (i + 1);
> >>> +                 ino++) {
> >> 
> >> Why recompute "ino" each time through this loop?  It should be enough to
> >> simply initialize it at 1 and then increment it for each inode written. 
> > 
> > Agreed.
> > 
> > --D
> >>> +                if (!ino)
> >>> +                    continue;
> >>> +                retval = ext2fs_write_inode(fs, ino, &inode);
> >>> +                if (retval) {
> >>> +                    com_err("inode_init", retval,
> >>> +                        "while writing inode %d\n",
> >>> +                        ino);
> >>> +                    exit(1);
> >>> +                }
> >>> +            }
> >>> +        } else {
> >>> +            retval = ext2fs_zero_blocks2(fs, blk, num, &blk, &num);
> >>> +            if (retval) {
> >>> +                fprintf(stderr, _("\nCould not write %d "
> >>> +                    "blocks in inode table starting "
> >>> +                    "at %llu: %s\n"),
> >>> +                    num, blk, error_message(retval));
> >>> +                exit(1);
> >>> +            }
> >>>       }
> >>>       if (sync_kludge) {
> >>>           if (sync_kludge == 1)
> >>> @@ -829,7 +851,8 @@ static __u32 ok_features[3] = {
> >>>       EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE|
> >>>       EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER|
> >>>       EXT4_FEATURE_RO_COMPAT_GDT_CSUM|
> >>> -        EXT4_FEATURE_RO_COMPAT_BIGALLOC
> >>> +        EXT4_FEATURE_RO_COMPAT_BIGALLOC|
> >>> +        EXT4_FEATURE_RO_COMPAT_METADATA_CSUM
> >>> };
> >>> 
> >>> 
> >>> 
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/misc/mke2fs.c b/misc/mke2fs.c
index 2d57d09..bbc0533 100644
--- a/misc/mke2fs.c
+++ b/misc/mke2fs.c
@@ -309,6 +309,8 @@  static void write_inode_tables(ext2_filsys fs, int lazy_flag, int itable_zeroed)
 	dgrp_t		i;
 	int		num;
 	struct ext2fs_numeric_progress_struct progress;
+	ext2_ino_t	ino;
+	struct ext2_inode_large inode;
 
 	ext2fs_numeric_progress_init(fs, &progress,
 				     _("Writing inode tables: "),
@@ -330,12 +332,32 @@  static void write_inode_tables(ext2_filsys fs, int lazy_flag, int itable_zeroed)
 			ext2fs_bg_flags_set(fs, i, EXT2_BG_INODE_ZEROED);
 			ext2fs_group_desc_csum_set(fs, i);
 		}
-		retval = ext2fs_zero_blocks2(fs, blk, num, &blk, &num);
-		if (retval) {
-			fprintf(stderr, _("\nCould not write %d "
-				  "blocks in inode table starting at %llu: %s\n"),
-				num, blk, error_message(retval));
-			exit(1);
+		if (fs->super->s_creator_os == EXT2_OS_LINUX &&
+		    fs->super->s_feature_ro_compat &
+		    EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) {
+			bzero(&inode, sizeof(inode));
+			for (ino = fs->super->s_inodes_per_group * i;
+			     ino < fs->super->s_inodes_per_group * (i + 1);
+			     ino++) {
+				if (!ino)
+					continue;
+				retval = ext2fs_write_inode(fs, ino, &inode);
+				if (retval) {
+					com_err("inode_init", retval,
+						"while writing inode %d\n",
+						ino);
+					exit(1);
+				}
+			}
+		} else {
+			retval = ext2fs_zero_blocks2(fs, blk, num, &blk, &num);
+			if (retval) {
+				fprintf(stderr, _("\nCould not write %d "
+					"blocks in inode table starting "
+					"at %llu: %s\n"),
+					num, blk, error_message(retval));
+				exit(1);
+			}
 		}
 		if (sync_kludge) {
 			if (sync_kludge == 1)
@@ -829,7 +851,8 @@  static __u32 ok_features[3] = {
 		EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE|
 		EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER|
 		EXT4_FEATURE_RO_COMPAT_GDT_CSUM|
-		EXT4_FEATURE_RO_COMPAT_BIGALLOC
+		EXT4_FEATURE_RO_COMPAT_BIGALLOC|
+		EXT4_FEATURE_RO_COMPAT_METADATA_CSUM
 };