diff mbox

[2/2] resize2fs: fix overhead calculation for meta_bg file systems

Message ID 1346690758-21072-2-git-send-email-tytso@mit.edu
State Superseded, archived
Headers show

Commit Message

Theodore Ts'o Sept. 3, 2012, 4:45 p.m. UTC
The file system overhead calculation in calculate_minimum_resize_size
was incorrect for meta_bg file systems.  This caused the minimum size to
underflow for very large file systems, which threw resize2fs into a
loop that generally lasted longer than the user's patience.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 resize/resize2fs.c | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

Comments

Yongqiang Yang Sept. 4, 2012, 1:59 a.m. UTC | #1
Hi Kevin,

  Ted has sent out the patches on online resizing for meta_bg and
64bits, so you can have a try again. It seems that the bug in
e2fsprogs has been fixed.

Yongqiang.

On Tue, Sep 4, 2012 at 12:45 AM, Theodore Ts'o <tytso@mit.edu> wrote:
> The file system overhead calculation in calculate_minimum_resize_size
> was incorrect meta_bg file systems.  This caused the minimum size to
> underflow for very large file systems, which threw resize2fs into a
> loop generally lasted longer than the user's patience.
>
> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
> ---
>  resize/resize2fs.c | 25 ++++++++++++++++++++++++-
>  1 file changed, 24 insertions(+), 1 deletion(-)
>
> diff --git a/resize/resize2fs.c b/resize/resize2fs.c
> index dc2805d..1dce498 100644
> --- a/resize/resize2fs.c
> +++ b/resize/resize2fs.c
> @@ -1890,6 +1890,8 @@ blk64_t calculate_minimum_resize_size(ext2_filsys fs)
>         blk64_t grp, data_needed, last_start;
>         blk64_t overhead = 0;
>         int num_of_superblocks = 0;
> +       blk64_t super_overhead = 0;
> +       int old_desc_blocks;
>         int extra_groups = 0;
>         int flexbg_size = 1 << fs->super->s_log_groups_per_flex;
>
> @@ -1909,15 +1911,36 @@ blk64_t calculate_minimum_resize_size(ext2_filsys fs)
>          * we need to figure out how many backup superblocks we have so we can
>          * account for that in the metadata
>          */
> +       if (fs->super->s_feature_incompat & EXT2_FEATURE_INCOMPAT_META_BG)
> +               old_desc_blocks = fs->super->s_first_meta_bg;
> +       else
> +               old_desc_blocks = fs->desc_blocks +
> +                       fs->super->s_reserved_gdt_blocks;
> +
>         for (grp = 0; grp < fs->group_desc_count; grp++) {
> +               blk64_t super_blk, old_desc_blk, new_desc_blk;
> +               int has_super;
> +
> +               ext2fs_super_and_bgd_loc2(fs, grp, &super_blk,
> +                                         &old_desc_blk, &new_desc_blk, 0);
> +               has_super = ((grp == 0) || super_blk);
> +               if (has_super)
> +                       super_overhead++;
> +               if (old_desc_blk)
> +                       super_overhead += old_desc_blocks;
> +               else if (new_desc_blk)
> +                       super_overhead++;
>                 if (ext2fs_bg_has_super(fs, grp))
>                         num_of_superblocks++;
> +
>         }
> +       printf("super overhead is %llu, old algorithm was %llu\n",
> +              super_overhead, SUPER_OVERHEAD(fs) * num_of_superblocks);
>
>         /* calculate how many blocks are needed for data */
>         data_needed = ext2fs_blocks_count(fs->super) -
>                 ext2fs_free_blocks_count(fs->super);
> -       data_needed -= SUPER_OVERHEAD(fs) * num_of_superblocks;
> +       data_needed -= super_overhead;
>         data_needed -= META_OVERHEAD(fs) * fs->group_desc_count;
>
>         if (fs->super->s_feature_incompat & EXT4_FEATURE_INCOMPAT_FLEX_BG) {
> --
> 1.7.12.rc0.22.gcdd159b
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yongqiang Yang Sept. 4, 2012, 2:02 a.m. UTC | #2
Hi Anssi,

Ted has sent out the patches on online-resizing on meta_bg and 64bits,
please have a try!

Yongqiang.

On Tue, Sep 4, 2012 at 12:45 AM, Theodore Ts'o <tytso@mit.edu> wrote:
> The file system overhead calculation in calculate_minimum_resize_size
> was incorrect meta_bg file systems.  This caused the minimum size to
> underflow for very large file systems, which threw resize2fs into a
> loop generally lasted longer than the user's patience.
>
> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
> ---
>  resize/resize2fs.c | 25 ++++++++++++++++++++++++-
>  1 file changed, 24 insertions(+), 1 deletion(-)
>
> diff --git a/resize/resize2fs.c b/resize/resize2fs.c
> index dc2805d..1dce498 100644
> --- a/resize/resize2fs.c
> +++ b/resize/resize2fs.c
> @@ -1890,6 +1890,8 @@ blk64_t calculate_minimum_resize_size(ext2_filsys fs)
>         blk64_t grp, data_needed, last_start;
>         blk64_t overhead = 0;
>         int num_of_superblocks = 0;
> +       blk64_t super_overhead = 0;
> +       int old_desc_blocks;
>         int extra_groups = 0;
>         int flexbg_size = 1 << fs->super->s_log_groups_per_flex;
>
> @@ -1909,15 +1911,36 @@ blk64_t calculate_minimum_resize_size(ext2_filsys fs)
>          * we need to figure out how many backup superblocks we have so we can
>          * account for that in the metadata
>          */
> +       if (fs->super->s_feature_incompat & EXT2_FEATURE_INCOMPAT_META_BG)
> +               old_desc_blocks = fs->super->s_first_meta_bg;
> +       else
> +               old_desc_blocks = fs->desc_blocks +
> +                       fs->super->s_reserved_gdt_blocks;
> +
>         for (grp = 0; grp < fs->group_desc_count; grp++) {
> +               blk64_t super_blk, old_desc_blk, new_desc_blk;
> +               int has_super;
> +
> +               ext2fs_super_and_bgd_loc2(fs, grp, &super_blk,
> +                                         &old_desc_blk, &new_desc_blk, 0);
> +               has_super = ((grp == 0) || super_blk);
> +               if (has_super)
> +                       super_overhead++;
> +               if (old_desc_blk)
> +                       super_overhead += old_desc_blocks;
> +               else if (new_desc_blk)
> +                       super_overhead++;
>                 if (ext2fs_bg_has_super(fs, grp))
>                         num_of_superblocks++;
> +
>         }
> +       printf("super overhead is %llu, old algorithm was %llu\n",
> +              super_overhead, SUPER_OVERHEAD(fs) * num_of_superblocks);
>
>         /* calculate how many blocks are needed for data */
>         data_needed = ext2fs_blocks_count(fs->super) -
>                 ext2fs_free_blocks_count(fs->super);
> -       data_needed -= SUPER_OVERHEAD(fs) * num_of_superblocks;
> +       data_needed -= super_overhead;
>         data_needed -= META_OVERHEAD(fs) * fs->group_desc_count;
>
>         if (fs->super->s_feature_incompat & EXT4_FEATURE_INCOMPAT_FLEX_BG) {
> --
> 1.7.12.rc0.22.gcdd159b
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Kevin Liao Sept. 4, 2012, 2:14 a.m. UTC | #3
2012/9/4 Yongqiang Yang <xiaoqiangnk@gmail.com>
>
> Hi Kevin,
>
>   Ted has sent out the patches on online resizing for meta_bg and
> 64bits, so you can have a try again. It seems that the bug in
> e2fsprogs has been fixed.
>
> Yongqiang.
>

Hi Ted & Yongqiang,

I will try to test the patch as soon as possible. Thanks a lot for your effort.

Regards,
Kevin Liao

> On Tue, Sep 4, 2012 at 12:45 AM, Theodore Ts'o <tytso@mit.edu> wrote:
> > The file system overhead calculation in calculate_minimum_resize_size
> > was incorrect meta_bg file systems.  This caused the minimum size to
> > underflow for very large file systems, which threw resize2fs into a
> > loop generally lasted longer than the user's patience.
> >
> > Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
> > ---
> >  resize/resize2fs.c | 25 ++++++++++++++++++++++++-
> >  1 file changed, 24 insertions(+), 1 deletion(-)
> >
> > diff --git a/resize/resize2fs.c b/resize/resize2fs.c
> > index dc2805d..1dce498 100644
> > --- a/resize/resize2fs.c
> > +++ b/resize/resize2fs.c
> > @@ -1890,6 +1890,8 @@ blk64_t calculate_minimum_resize_size(ext2_filsys
> > fs)
> >         blk64_t grp, data_needed, last_start;
> >         blk64_t overhead = 0;
> >         int num_of_superblocks = 0;
> > +       blk64_t super_overhead = 0;
> > +       int old_desc_blocks;
> >         int extra_groups = 0;
> >         int flexbg_size = 1 << fs->super->s_log_groups_per_flex;
> >
> > @@ -1909,15 +1911,36 @@ blk64_t
> > calculate_minimum_resize_size(ext2_filsys fs)
> >          * we need to figure out how many backup superblocks we have so
> > we can
> >          * account for that in the metadata
> >          */
> > +       if (fs->super->s_feature_incompat &
> > EXT2_FEATURE_INCOMPAT_META_BG)
> > +               old_desc_blocks = fs->super->s_first_meta_bg;
> > +       else
> > +               old_desc_blocks = fs->desc_blocks +
> > +                       fs->super->s_reserved_gdt_blocks;
> > +
> >         for (grp = 0; grp < fs->group_desc_count; grp++) {
> > +               blk64_t super_blk, old_desc_blk, new_desc_blk;
> > +               int has_super;
> > +
> > +               ext2fs_super_and_bgd_loc2(fs, grp, &super_blk,
> > +                                         &old_desc_blk, &new_desc_blk,
> > 0);
> > +               has_super = ((grp == 0) || super_blk);
> > +               if (has_super)
> > +                       super_overhead++;
> > +               if (old_desc_blk)
> > +                       super_overhead += old_desc_blocks;
> > +               else if (new_desc_blk)
> > +                       super_overhead++;
> >                 if (ext2fs_bg_has_super(fs, grp))
> >                         num_of_superblocks++;
> > +
> >         }
> > +       printf("super overhead is %llu, old algorithm was %llu\n",
> > +              super_overhead, SUPER_OVERHEAD(fs) * num_of_superblocks);
> >
> >         /* calculate how many blocks are needed for data */
> >         data_needed = ext2fs_blocks_count(fs->super) -
> >                 ext2fs_free_blocks_count(fs->super);
> > -       data_needed -= SUPER_OVERHEAD(fs) * num_of_superblocks;
> > +       data_needed -= super_overhead;
> >         data_needed -= META_OVERHEAD(fs) * fs->group_desc_count;
> >
> >         if (fs->super->s_feature_incompat &
> > EXT4_FEATURE_INCOMPAT_FLEX_BG) {
> > --
> > 1.7.12.rc0.22.gcdd159b
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
>
>
> --
> Best Wishes
> Yongqiang Yang
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Theodore Ts'o Sept. 4, 2012, 2:14 a.m. UTC | #4
On Tue, Sep 04, 2012 at 09:59:55AM +0800, Yongqiang Yang wrote:
> Hi Kevin,
> 
>   Ted has sent out the patches on online resizing for meta_bg and
> 64bits, so you can have a try again. It seems that the bug in
> e2fsprogs has been fixed.

Make sure you use the latest version of the kernel patches that I just
sent out.  There were quite a number of bugs in Yongqiang's original
patch set which I tripped over while I was testing 64-bit resize ---
and please note that there are definitely still rough edges
(especially in cases where the file system was created < 16TB, but
with the 64-bit and resize_inode features enabled).  There may
also be bugs for the straightforward case of resizing very large file
systems.

So while I very much appreciate users giving the code a try and
sending us feedback, please do think twice before using this code on
file systems with data that hasn't been backed up recently.  (Of
course, being good System Administrators you are all keeping --- and
verifying --- regular backups, right?  :-)

Regards,

						- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Anssi Hannula Sept. 4, 2012, 5:05 p.m. UTC | #5
04.09.2012 05:14, Theodore Ts'o kirjoitti:
> On Tue, Sep 04, 2012 at 09:59:55AM +0800, Yongqiang Yang wrote:
>> Hi Kevin,
>>
>>   Ted has sent out the patches on online resizing for meta_bg and
>> 64bits, so you can have a try again. It seems that the bug in
>> e2fsprogs has been fixed.
> 
> Make sure you use the latest version of the kernel patches that I just
> sent out.  There quite a number of bugs in the Yongqiang's original
> patch set which I tripped over while I was testing 64-bit resize ---
> and please note that there are definitely still rough edges
> (especially for in cases where the file system was created < 16TB, but
> with the 64-bit feature and resize_inode features enabled).  There may
> also be bugs for the straightforward case of resizing very large file
> systems.

Indeed, I hit a BUG_ON() on resize from 8589934590 blocks to 8589934640
blocks (4k):
 [  676.140165] ------------[ cut here ]------------
 [  676.150026] kernel BUG at fs/ext4/resize.c:255!
 [  676.150026] invalid opcode: 0000 [#1] SMP
 [  676.150026] CPU 0
 [  676.150026] Modules linked in:[  676.150026]  dm_snapshot dm_zero
af_packet dm_mod joydev hid_generic ppdev snd_intel8x0 snd_ac97_codec
ac97_bus usbhid microcode e1000 snd_pcm snd_page_alloc snd_timer hid
i2c_piix4 i2c_core button snd soundcore ac parport_pc parport processor
evdev ipv6 autofs4 ext4 crc16 jbd2 ohci_hcd sd_mod crc_t10dif usbcore
usb_common sr_mod ata_piix ahci libahci libata scsi_mod [last unloaded:
nf_defrag_ipv4]

 [  676.150026] Pid: 1793, comm: resize2fs Not tainted
3.5.3-server-2anssi.9.ext4.10.2 #1 innotek GmbH VirtualBox
 [  676.150026] RIP: 0010:[<ffffffffa014e9bc>]  [<ffffffffa014e9bc>]
ext4_resize_fs+0x94c/0xa30 [ext4]
 [  676.150026] RSP: 0018:ffff880046eedd18  EFLAGS: 00010246
 [  676.150026] RAX: 0000000000040001 RBX: ffff88005b799800 RCX:
0000000000000001
 [  676.150026] RDX: 0000000000081bf1 RSI: 0000000000040001 RDI:
ffff88005b068000
 [  676.150026] RBP: ffff880046eeddd8 R08: 0000000200000003 R09:
0000000000000000
 [  676.150026] R10: 0000000000000000 R11: 0000000080042000 R12:
0000000000040001
 [  676.150026] R13: ffff880037fb5e20 R14: 0000000000000000 R15:
ffff88005b068000
 [  676.150026] FS:  00007fb43e067740(0000) GS:ffff88005fc00000(0000)
knlGS:0000000000000000
 [  676.150026] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
 [  676.150026] CR2: 00007fd905261178 CR3: 0000000044993000 CR4:
00000000000006f0
 [  676.150026] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
0000000000000000
 [  676.150026] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7:
0000000000000400
 [  676.150026] Process resize2fs (pid: 1793, threadinfo
ffff880046eec000, task ffff880053f824c0)
 [  676.150026] Stack:
 [  676.150026]  ffff880046eedda8 ffffffff8117971e 0000000f53529c40
0000000000000000
 [  676.150026]  000000020000002f 0000000000000000 0000000200000030
ffff88005b20e990
 [  676.150026]  0000000100000001 ffff880000000001 0000000200000000
0000000200000000
 [  676.150026] Call Trace:
 [  676.150026]  [<ffffffff8117971e>] ? do_last+0x2ee/0x9f0
 [  676.150026]  [<ffffffffa012e05f>] ext4_ioctl+0x9af/0xbc0 [ext4]
 [  676.150026]  [<ffffffff8117db6f>] do_vfs_ioctl+0x8f/0x4e0
 [  676.150026]  [<ffffffff8117e051>] sys_ioctl+0x91/0xa0
 [  676.150026]  [<ffffffff8147d0bd>] system_call_fastpath+0x1a/0x1f
 [  676.150026] Code: c7 c1 60 2c 17 a0 ba 0c 07 00 00 48 c7 c6 9b e8 16
a0 4c 89 e7 31 c0 e8 b3 80 ff ff c7 85 6c ff ff ff ea ff ff ff e9 4c f8
ff ff <0f> 0b 8b 55 a4 8b 45 a0 f7 da 44 31 e0 85 c2 0f 84 6c fb ff ff
 [  676.150026] RIP  [<ffffffffa014e9bc>] ext4_resize_fs+0x94c/0xa30 [ext4]
 [  676.150026]  RSP <ffff880046eedd18>
 [  676.788513] ---[ end trace fbf2bd5a59c2ab99 ]---

This is BUG_ON(src_group >= group_data[0].group + flex_gd->count);

I was using the below basic test script which uses a virtual large
volume in LVM (e2fsprogs is 1.42.5, except for resize2fs):

#!/bin/bash -ex

VG=delta
LV=ext4test
LVSIZE=40T
MOUNTPOINT="/mnt/iso"
RESIZE2FS=/root/resize2fs

INITIAL_SIZE_K=4294967295
NEW_BLOCKS=8589934590

lvcreate -l 100%FREE -V "$LVSIZE" -n "$LV" "$VG"
mkfs.ext4 -O meta_bg,64bit,^resize_inode "/dev/$VG/$LV" "$INITIAL_SIZE_K"

mount "/dev/$VG/$LV" "$MOUNTPOINT"

mkdir "$MOUNTPOINT/test"
for file in 1 2; do
        dd if=/dev/urandom bs=1M count=50 of="$MOUNTPOINT/test/$file"
done
md5sum $MOUNTPOINT/test/* > $MOUNTPOINT/MD5SUM

for N in $NEW_BLOCKS $((NEW_BLOCKS + 50)); do
        $RESIZE2FS "/dev/$VG/$LV" "$N"

        umount "$MOUNTPOINT"
        fsck.ext4 -nvf "/dev/$VG/$LV"
        mount "/dev/$VG/$LV" "$MOUNTPOINT"
        md5sum -c "$MOUNTPOINT/MD5SUM"
done



> So while I very much appreciate users giving the code a try and
> sending us feedback, please do think twice before using this code on
> file systems with data that hasn't been backed up recently.  (Of
> course, being good System Administrators you are all keeping --- and
> verifying --- regular backups, right?  :-)
Yongqiang Yang Sept. 5, 2012, 2:10 a.m. UTC | #6
Hi Anssi,

The bug has been fixed for a while; please check the patches:
[PATCH 1/2] ext4: teach resize report old blocks count correctly
[PATCH 2/2] ext4: ignore last group without enough space when resizing

Please have a try!!!

Thanks,
Yongqiang.

On Wed, Sep 5, 2012 at 1:05 AM, Anssi Hannula <anssi.hannula@iki.fi> wrote:
> 04.09.2012 05:14, Theodore Ts'o kirjoitti:
>> On Tue, Sep 04, 2012 at 09:59:55AM +0800, Yongqiang Yang wrote:
>>> Hi Kevin,
>>>
>>>   Ted has sent out the patches on online resizing for meta_bg and
>>> 64bits, so you can have a try again. It seems that the bug in
>>> e2fsprogs has been fixed.
>>
>> Make sure you use the latest version of the kernel patches that I just
>> sent out.  There quite a number of bugs in the Yongqiang's original
>> patch set which I tripped over while I was testing 64-bit resize ---
>> and please note that there are definitely still rough edges
>> (especially for in cases where the file system was created < 16TB, but
>> with the 64-bit feature and resize_inode features enabled).  There may
>> also be bugs for the straightforward case of resizing very large file
>> systems.
>
> Indeed, I hit a BUG_ON() on resize from 8589934590 blocks to 8589934640
> blocks (4k):
>  [  676.140165] ------------[ cut here ]------------
>  [  676.150026] kernel BUG at fs/ext4/resize.c:255!
>  [  676.150026] invalid opcode: 0000 [#1] SMP
>  [  676.150026] CPU 0
>  [  676.150026] Modules linked in:[  676.150026]  dm_snapshot dm_zero
> af_packet dm_mod joydev hid_generic ppdev snd_intel8x0 snd_ac97_codec
> ac97_bus usbhid microcode e1000 snd_pcm snd_page_alloc snd_timer hid
> i2c_piix4 i2c_core button snd soundcore ac parport_pc parport processor
> evdev ipv6 autofs4 ext4 crc16 jbd2 ohci_hcd sd_mod crc_t10dif usbcore
> usb_common sr_mod ata_piix ahci libahci libata scsi_mod [last unloaded:
> nf_defrag_ipv4]
>
>  [  676.150026] Pid: 1793, comm: resize2fs Not tainted
> 3.5.3-server-2anssi.9.ext4.10.2 #1 innotek GmbH VirtualBox
>  [  676.150026] RIP: 0010:[<ffffffffa014e9bc>]  [<ffffffffa014e9bc>]
> ext4_resize_fs+0x94c/0xa30 [ext4]
>  [  676.150026] RSP: 0018:ffff880046eedd18  EFLAGS: 00010246
>  [  676.150026] RAX: 0000000000040001 RBX: ffff88005b799800 RCX:
> 0000000000000001
>  [  676.150026] RDX: 0000000000081bf1 RSI: 0000000000040001 RDI:
> ffff88005b068000
>  [  676.150026] RBP: ffff880046eeddd8 R08: 0000000200000003 R09:
> 0000000000000000
>  [  676.150026] R10: 0000000000000000 R11: 0000000080042000 R12:
> 0000000000040001
>  [  676.150026] R13: ffff880037fb5e20 R14: 0000000000000000 R15:
> ffff88005b068000
>  [  676.150026] FS:  00007fb43e067740(0000) GS:ffff88005fc00000(0000)
> knlGS:0000000000000000
>  [  676.150026] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
>  [  676.150026] CR2: 00007fd905261178 CR3: 0000000044993000 CR4:
> 00000000000006f0
>  [  676.150026] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
> 0000000000000000
>  [  676.150026] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7:
> 0000000000000400
>  [  676.150026] Process resize2fs (pid: 1793, threadinfo
> ffff880046eec000, task ffff880053f824c0)
>  [  676.150026] Stack:
>  [  676.150026]  ffff880046eedda8 ffffffff8117971e 0000000f53529c40
> 0000000000000000
>  [  676.150026]  000000020000002f 0000000000000000 0000000200000030
> ffff88005b20e990
>  [  676.150026]  0000000100000001 ffff880000000001 0000000200000000
> 0000000200000000
>  [  676.150026] Call Trace:
>  [  676.150026]  [<ffffffff8117971e>] ? do_last+0x2ee/0x9f0
>  [  676.150026]  [<ffffffffa012e05f>] ext4_ioctl+0x9af/0xbc0 [ext4]
>  [  676.150026]  [<ffffffff8117db6f>] do_vfs_ioctl+0x8f/0x4e0
>  [  676.150026]  [<ffffffff8117e051>] sys_ioctl+0x91/0xa0
>  [  676.150026]  [<ffffffff8147d0bd>] system_call_fastpath+0x1a/0x1f
>  [  676.150026] Code: c7 c1 60 2c 17 a0 ba 0c 07 00 00 48 c7 c6 9b e8 16
> a0 4c 89 e7 31 c0 e8 b3 80 ff ff c7 85 6c ff ff ff ea ff ff ff e9 4c f8
> ff ff <0f> 0b 8b 55 a4 8b 45 a0 f7 da 44 31 e0 85 c2 0f 84 6c fb ff ff
>  [  676.150026] RIP  [<ffffffffa014e9bc>] ext4_resize_fs+0x94c/0xa30 [ext4]
>  [  676.150026]  RSP <ffff880046eedd18>
>  [  676.788513] ---[ end trace fbf2bd5a59c2ab99 ]---
>
> This is BUG_ON(src_group >= group_data[0].group + flex_gd->count);
>
> I was using the below basic test script which uses a virtual large
> volume in LVM (e2fsprogs is 1.42.5, except for resize2fs):
>
> #!/bin/bash -ex
>
> VG=delta
> LV=ext4test
> LVSIZE=40T
> MOUNTPOINT="/mnt/iso"
> RESIZE2FS=/root/resize2fs
>
> INITIAL_SIZE_K=4294967295
> NEW_BLOCKS=8589934590
>
> lvcreate -l 100%FREE -V "$LVSIZE" -n "$LV" "$VG"
> mkfs.ext4 -O meta_bg,64bit,^resize_inode "/dev/$VG/$LV" "$INITIAL_SIZE_K"
>
> mount "/dev/$VG/$LV" "$MOUNTPOINT"
>
> mkdir "$MOUNTPOINT/test"
> for file in 1 2; do
>         dd if=/dev/urandom bs=1M count=50 of="$MOUNTPOINT/test/$file"
> done
> md5sum $MOUNTPOINT/test/* > $MOUNTPOINT/MD5SUM
>
> for N in $NEW_BLOCKS $((NEW_BLOCKS + 50)); do
>         $RESIZE2FS "/dev/$VG/$LV" "$N"
>
>         umount "$MOUNTPOINT"
>         fsck.ext4 -nvf "/dev/$VG/$LV"
>         mount "/dev/$VG/$LV" "$MOUNTPOINT"
>         md5sum -c "$MOUNTPOINT/MD5SUM"
> done
>
>
>
>> So while I very much appreciate users giving the code a try and
>> sending us feedback, please do think twice before using this code on
>> file systems with data that hasn't been backed up recently.  (Of
>> course, being good System Administrators you are all keeping --- and
>> verifying --- regular backups, right?  :-)
>
> --
> Anssi Hannula
Theodore Ts'o Sept. 5, 2012, 4:55 a.m. UTC | #7
On Wed, Sep 05, 2012 at 10:10:29AM +0800, Yongqiang Yang wrote:
> Hi Anssi,
> 
> The bug was fixed for a while, please check the patches:
> [PATCH 1/2] ext4: teach resize report old blocks count correctly
> [PATCH 2/2] ext4: ignore last group without enough space when resizing
> 
> Please have a try!!!

Yongqiang,

In the future, if a patch is going to fix a BUG_ON or kernel crash,
please state so explicitly in the commit description along with
instructions about how to reproduce the problem.  The urgency of a
patch which (for example) fixes a debugging printk (such as your 1/2
patch above) is quite different from a patch which fixes a kernel
BUG_ON.

One of the reasons why I hadn't gotten around to processing your
patches until now was partially because I knew there was a lot of
testing and fixing before the patches were fully baked (as soon as I
started doing testing I found all sorts of other problems, which I had
to fix), but also because the commit descriptions were not clear
enough.

Patches where it's obvious what they fix, and where there is a clear
explanation about what they fix and the priority of their fix makes
life easier for me, and makes it more likely that I can process the
patches quickly.

Also, if you have a follow-on set of patches which is dependent on the
initial set of patches, it's very helpful to resend a v2 version of
the patches so that it's clear how the patches fit together.

I'll take care of these two extra patches, and then you'll see me send
out a -v2 set of the patches which contain all of the online resize
patches rebased to the latest kernel and tested as much as possible.
In general, though, in order for me to scale, I really need ext4
developers to do as much of this testing, rebasing, and reposting
patches as possible, and for other ext4 developers to review the
patches.  If I have to do all of this myself, patches will flow into
mainline more slowly, and we'll start accumulating a much longer
backlog.

Regards,

						- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yongqiang Yang Sept. 5, 2012, 5:16 a.m. UTC | #8
Got it!

Thanks,
Yongqiang.

On Wed, Sep 5, 2012 at 12:55 PM, Theodore Ts'o <tytso@mit.edu> wrote:
> On Wed, Sep 05, 2012 at 10:10:29AM +0800, Yongqiang Yang wrote:
>> Hi Anssi,
>>
>> The bug was fixed for a while, please check the patches:
>> [PATCH 1/2] ext4: teach resize report old blocks count correctly
>> [PATCH 2/2] ext4: ignore last group without enough space when resizing
>>
>> Please have a try!!!
>
> Yongqiang,
>
> In the future, if a patch is going to fix a BUG_ON or kernel crash,
> please state so explicitly in the commit description along with
> instructions about how to reproduce the problem.  The urgency of a
> patch which (for example) fixes a debugging printk (such as your 1/2
> patch above) is quite different from a patch which causes a kernel
> BUG_ON.
>
> One of the reasons why I hadn't gotten around to processing your
> patches until now was partially because I knew there was a lot of
> testing and fixing before the patches were fully baked (as soon as I
> started doing testing I found all sorts of other problems, which I had
> to fix), but also because the commit descriptions were not clear
> enough.
>
> Patches where it's obvious what they fix, and where there is a clear
> explanation about what they fix and the priority of their fix makes
> life easier for me, and makes it more likely that I can process the
> patches quickly.
>
> Also, if you have a follow-on set of patches which is dependent on the
> initila set of patches, it's very helpful to resend a v2 version of
> the patches so that it's clear how the patches fit together.
>
> I'll take care of these two extra patches, and then you'll see me send
> out a -v2 set of the patches which contain all of the online resize
> patches rebased to the latest kernel and tested as much as possible.
> In general, though, in order for me to scale, I really need ext4
> developers to do as much of this testing, rebasing, and reposting
> patches as possible, and for other ext4 developers to review the
> patches.  If I have to do all of this myself, patches will flow into
> mainline more slowly, and we'll start accumulating a much longer
> backlog.
>
> Regards,
>
>                                                 - Ted
Theodore Ts'o Sept. 5, 2012, 5:38 a.m. UTC | #9
BTW, it looks like your 2/2 patch does not have a dependency on the
rest of the resize patches, and fixes a problem which exists today
with the flex_bg resizing.  So you'll see I moved it to the beginning
of the patch series, and added a "Cc: stable@vger.kernel.org", since
it's a patch that should be backported to older stable kernels.

This kind of applicability statement is helpful for me, since I don't
have to try to figure it out (and because I might get it wrong as I
try to figure out the reasoning behind a patch, and the priority of
the patch).  Things that are useful to include are whether it is fixing
a recent regression or a bug that is in older kernels; how long the
bug has been present, so we know which older kernels need the patch;
and, in particular, whether enterprise distributions need to worry
about backporting the patch.

And as I mentioned earlier, if it causes user data loss/corruption, or
causes the kernel to crash, please make a note of this in the commit
description, since that's also important information when trying to
decide if a patch needs priority handling or needs to be backported to
older stable kernels.

Thanks, regards,

					- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Kevin Liao Sept. 5, 2012, 6:32 a.m. UTC | #10
2012/9/4 Theodore Ts'o <tytso@mit.edu>:
> On Tue, Sep 04, 2012 at 09:59:55AM +0800, Yongqiang Yang wrote:
>> Hi Kevin,
>>
>>   Ted has sent out the patches on online resizing for meta_bg and
>> 64bits, so you can have a try again. It seems that the bug in
>> e2fsprogs has been fixed.
>
> Make sure you use the latest version of the kernel patches that I just
> sent out.  There quite a number of bugs in the Yongqiang's original
> patch set which I tripped over while I was testing 64-bit resize ---
> and please note that there are definitely still rough edges
> (especially for in cases where the file system was created < 16TB, but
> with the 64-bit feature and resize_inode features enabled).  There may
> also be bugs for the straightforward case of resizing very large file
> systems.
>
> So while I very much appreciate users giving the code a try and
> sending us feedback, please do think twice before using this code on
> file systems with data that hasn't been backed up recently.  (Of
> course, being good System Administrators you are all keeping --- and
> verifying --- regular backups, right?  :-)
>
> Regards,
>
>                                                 - Ted

I had done some simple and quick test. The following is the result.

Kernel: 3.4.7 + 5 patches
e2fsprogs: 1.42.5 + 2 patches

The format command I used is:
mke2fs -t ext4 -m0 -b 4096 -F -O 64bit,meta_bg,^resize_inode /dev/md0 nnnn

Case 1: Simply resize
1st step: resize from 14T to 18T => ok
2nd step: resize from 18T to 20T => ok (calculate_minimum_resize_size
issue gone)
3rd step: resize from 20T to 21T => ok

Case 2: case 1 + file read-write (just like Anssi did)
1st step: resize from 14T to 20T (5368709120 blocks) => ok
2nd step: resize from 20T to 5368709170 blocks => same kernel bug_on

Case 3: case 2 + Yongqiang's 2 patches
1st step: resize from 14T to 20T (5368709120 blocks) => ok
2nd step: resize from 20T to 5368709170 blocks => ok

Basically I think the resize functionality should be ok. However I also
observe some performance drop. That is, the times needed for mke2fs,
mount and e2fsck are longer than before. Here is some detailed data:

For 12TB with 64bit,meta_bg,^resize_inode
mke2fs: 54.699s
mount: 12.108s
e2fsck: 1m52.027s

For 12TB without 64bit,meta_bg,^resize_inode
mke2fs: 39.763s
mount: 0.897s
e2fsck: 1m17.554s

For 20TB with 64bit,meta_bg,^resize_inode
mke2fs: 1m25.090s
mount: 19.992s
e2fsck: 2m55.048s

For 20TB without 64bit,meta_bg,^resize_inode
mke2fs: 1m3.660s
mount: 1.458s
e2fsck: 1m56.055s

Yongqiang had told me previously that it may be caused by using
meta_bg. I am still wondering whether there is anything we can do to
improve the performance. Thanks a lot.

Regards,
Kevin Liao
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yongqiang Yang Sept. 5, 2012, 6:44 a.m. UTC | #11
On Wed, Sep 5, 2012 at 2:32 PM, Kevin Liao <kevinlia@gmail.com> wrote:
> 2012/9/4 Theodore Ts'o <tytso@mit.edu>:
>> On Tue, Sep 04, 2012 at 09:59:55AM +0800, Yongqiang Yang wrote:
>>> Hi Kevin,
>>>
>>>   Ted has sent out the patches on online resizing for meta_bg and
>>> 64bits, so you can have a try again. It seems that the bug in
>>> e2fsprogs has been fixed.
>>
>> Make sure you use the latest version of the kernel patches that I just
>> sent out.  There quite a number of bugs in the Yongqiang's original
>> patch set which I tripped over while I was testing 64-bit resize ---
>> and please note that there are definitely still rough edges
>> (especially for in cases where the file system was created < 16TB, but
>> with the 64-bit feature and resize_inode features enabled).  There may
>> also be bugs for the straightforward case of resizing very large file
>> systems.
>>
>> So while I very much appreciate users giving the code a try and
>> sending us feedback, please do think twice before using this code on
>> file systems with data that hasn't been backed up recently.  (Of
>> course, being good System Administrators you are all keeping --- and
>> verifying --- regular backups, right?  :-)
>>
>> Regards,
>>
>>                                                 - Ted
>
> I had done some simple and quick test. The following is the result.
>
> Kernel: 3.4.7 + 5 patches
> e2fsprogs: 1.42.5 + 2 patches
>
> The format command I used is:
> mke2fs -t ext4 -m0 -b 4096 -F -O 64bit,meta_bg,^resize_inode /dev/md0 nnnn
>
> Case 1: Simplly resize
> 1st step: resize from 14T to 18T => ok
> 2nd step: resize from 18T to 20T => ok (calculate_minimum_resize_size
> issue gone)
> 3rd step: resize from 20T to 21T => ok
>
> Case 2: case 1 + file read-write (just like Anssi did)
> 1st step: resize from 14T to 20T (5368709120 blocks) => ok
> 2nd step: resize from 20T to 5368709170 blocks => same kernel bug_on
>
> Case 3: case 2 + Yongqiang's 2 patches
> 1st step: resize from 14T to 20T (5368709120 blocks) => ok
> 2nd step: resize from 20T to 5368709170 blocks => ok
>
> Basically I think the resize funtionality should be ok. However I also
> observe some performance drop. That is, the time needed for mke2fs,
> mount and e2fsck are longer than before. Here is some detailed data:
>
> For 12TB with 64bit,meta_bg,^resize_inode
> mke2fs: 54.699s
> mount: 12.108s
> e2fsck: 1m52.027s
>
> For 12TB without 64bit,meta_bg,^resize_inode
Did you mean without 64bit and without meta_bg, OR without 64bit
and with meta_bg?

I am guessing you meant without 64bit and without meta_bg, am I right?
Yongqiang.
> mke2fs: 39.763s
> mount: 0.897s
> e2fsck: 1m17.554s
>
> For 20TB with 64bit,meta_bg,^resize_inode
> mke2fs: 1m25.090s
> mount: 19.992s
> e2fsck: 2m55.048s
>
> For 20TB without 64bit,meta_bg,^resize_inode
> mke2fs: 1m3.660s
> mount: 1.458s
> e2fsck: 1m56.055s
>
> Yongqiang had told me previously that it may be caused by using
> meta_bg. I am still wondering is there anything we can do to improve
> the peroformance? Thanks a lot.
>
> Regards,
> Kevin Liao
Kevin Liao Sept. 5, 2012, 6:50 a.m. UTC | #12
2012/9/5 Yongqiang Yang <xiaoqiangnk@gmail.com>:
> On Wed, Sep 5, 2012 at 2:32 PM, Kevin Liao <kevinlia@gmail.com> wrote:
>> 2012/9/4 Theodore Ts'o <tytso@mit.edu>:
>>> On Tue, Sep 04, 2012 at 09:59:55AM +0800, Yongqiang Yang wrote:
>>>> Hi Kevin,
>>>>
>>>>   Ted has sent out the patches on online resizing for meta_bg and
>>>> 64bits, so you can have a try again. It seems that the bug in
>>>> e2fsprogs has been fixed.
>>>
>>> Make sure you use the latest version of the kernel patches that I just
>>> sent out.  There quite a number of bugs in the Yongqiang's original
>>> patch set which I tripped over while I was testing 64-bit resize ---
>>> and please note that there are definitely still rough edges
>>> (especially for in cases where the file system was created < 16TB, but
>>> with the 64-bit feature and resize_inode features enabled).  There may
>>> also be bugs for the straightforward case of resizing very large file
>>> systems.
>>>
>>> So while I very much appreciate users giving the code a try and
>>> sending us feedback, please do think twice before using this code on
>>> file systems with data that hasn't been backed up recently.  (Of
>>> course, being good System Administrators you are all keeping --- and
>>> verifying --- regular backups, right?  :-)
>>>
>>> Regards,
>>>
>>>                                                 - Ted
>>
>> I had done some simple and quick test. The following is the result.
>>
>> Kernel: 3.4.7 + 5 patches
>> e2fsprogs: 1.42.5 + 2 patches
>>
>> The format command I used is:
>> mke2fs -t ext4 -m0 -b 4096 -F -O 64bit,meta_bg,^resize_inode /dev/md0 nnnn
>>
>> Case 1: Simplly resize
>> 1st step: resize from 14T to 18T => ok
>> 2nd step: resize from 18T to 20T => ok (calculate_minimum_resize_size
>> issue gone)
>> 3rd step: resize from 20T to 21T => ok
>>
>> Case 2: case 1 + file read-write (just like Anssi did)
>> 1st step: resize from 14T to 20T (5368709120 blocks) => ok
>> 2nd step: resize from 20T to 5368709170 blocks => same kernel bug_on
>>
>> Case 3: case 2 + Yongqiang's 2 patches
>> 1st step: resize from 14T to 20T (5368709120 blocks) => ok
>> 2nd step: resize from 20T to 5368709170 blocks => ok
>>
>> Basically I think the resize funtionality should be ok. However I also
>> observe some performance drop. That is, the time needed for mke2fs,
>> mount and e2fsck are longer than before. Here is some detailed data:
>>
>> For 12TB with 64bit,meta_bg,^resize_inode
>> mke2fs: 54.699s
>> mount: 12.108s
>> e2fsck: 1m52.027s
>>
>> For 12TB without 64bit,meta_bg,^resize_inode
> Did you mean without 64bit and without meta_bg OR with without 64bit
> and with meta_bg?
>
> I am guessing you meant without 64bit and without meta_bg, am I right?
> Yongqiang.

What I mean by "with 64bit,meta_bg,^resize_inode" is using the following
format command:
mke2fs -t ext4 -m0 -b 4096 -F -O 64bit,meta_bg,^resize_inode /dev/md0 3758096384

And without 64bit,meta_bg,^resize_inode means
mke2fs -t ext4 -m0 -b 4096 -F /dev/md0 3758096384

Regards,
Kevin Liao

>> mke2fs: 39.763s
>> mount: 0.897s
>> e2fsck: 1m17.554s
>>
>> For 20TB with 64bit,meta_bg,^resize_inode
>> mke2fs: 1m25.090s
>> mount: 19.992s
>> e2fsck: 2m55.048s
>>
>> For 20TB without 64bit,meta_bg,^resize_inode
>> mke2fs: 1m3.660s
>> mount: 1.458s
>> e2fsck: 1m56.055s
>>
>> Yongqiang had told me previously that it may be caused by using
>> meta_bg. I am still wondering is there anything we can do to improve
>> the peroformance? Thanks a lot.
>>
>> Regards,
>> Kevin Liao
>
>
>
> --
> Best Wishes
> Yongqiang Yang
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Anssi Hannula Sept. 6, 2012, 2:22 p.m. UTC | #13
05.09.2012 05:10, Yongqiang Yang kirjoitti:
> Hi Anssi,

Hi,

> The bug was fixed for a while, please check the patches:
> [PATCH 1/2] ext4: teach resize report old blocks count correctly
> [PATCH 2/2] ext4: ignore last group without enough space when resizing
> 
> Please have a try!!!

Confirmed that with these patches the simple test passes :)


> Thanks,
> Yongqiang.
> 
> On Wed, Sep 5, 2012 at 1:05 AM, Anssi Hannula <anssi.hannula@iki.fi> wrote:
>> 04.09.2012 05:14, Theodore Ts'o kirjoitti:
>>> On Tue, Sep 04, 2012 at 09:59:55AM +0800, Yongqiang Yang wrote:
>>>> Hi Kevin,
>>>>
>>>>   Ted has sent out the patches on online resizing for meta_bg and
>>>> 64bits, so you can have a try again. It seems that the bug in
>>>> e2fsprogs has been fixed.
>>>
>>> Make sure you use the latest version of the kernel patches that I just
>>> sent out.  There quite a number of bugs in the Yongqiang's original
>>> patch set which I tripped over while I was testing 64-bit resize ---
>>> and please note that there are definitely still rough edges
>>> (especially for in cases where the file system was created < 16TB, but
>>> with the 64-bit feature and resize_inode features enabled).  There may
>>> also be bugs for the straightforward case of resizing very large file
>>> systems.
>>
>> Indeed, I hit a BUG_ON() on resize from 8589934590 blocks to 8589934640
>> blocks (4k):
>>  [  676.140165] ------------[ cut here ]------------
>>  [  676.150026] kernel BUG at fs/ext4/resize.c:255!
>>  [  676.150026] invalid opcode: 0000 [#1] SMP
>>  [  676.150026] CPU 0
>>  [  676.150026] Modules linked in:[  676.150026]  dm_snapshot dm_zero
>> af_packet dm_mod joydev hid_generic ppdev snd_intel8x0 snd_ac97_codec
>> ac97_bus usbhid microcode e1000 snd_pcm snd_page_alloc snd_timer hid
>> i2c_piix4 i2c_core button snd soundcore ac parport_pc parport processor
>> evdev ipv6 autofs4 ext4 crc16 jbd2 ohci_hcd sd_mod crc_t10dif usbcore
>> usb_common sr_mod ata_piix ahci libahci libata scsi_mod [last unloaded:
>> nf_defrag_ipv4]
>>
>>  [  676.150026] Pid: 1793, comm: resize2fs Not tainted
>> 3.5.3-server-2anssi.9.ext4.10.2 #1 innotek GmbH VirtualBox
>>  [  676.150026] RIP: 0010:[<ffffffffa014e9bc>]  [<ffffffffa014e9bc>]
>> ext4_resize_fs+0x94c/0xa30 [ext4]
>>  [  676.150026] RSP: 0018:ffff880046eedd18  EFLAGS: 00010246
>>  [  676.150026] RAX: 0000000000040001 RBX: ffff88005b799800 RCX:
>> 0000000000000001
>>  [  676.150026] RDX: 0000000000081bf1 RSI: 0000000000040001 RDI:
>> ffff88005b068000
>>  [  676.150026] RBP: ffff880046eeddd8 R08: 0000000200000003 R09:
>> 0000000000000000
>>  [  676.150026] R10: 0000000000000000 R11: 0000000080042000 R12:
>> 0000000000040001
>>  [  676.150026] R13: ffff880037fb5e20 R14: 0000000000000000 R15:
>> ffff88005b068000
>>  [  676.150026] FS:  00007fb43e067740(0000) GS:ffff88005fc00000(0000)
>> knlGS:0000000000000000
>>  [  676.150026] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
>>  [  676.150026] CR2: 00007fd905261178 CR3: 0000000044993000 CR4:
>> 00000000000006f0
>>  [  676.150026] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
>> 0000000000000000
>>  [  676.150026] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7:
>> 0000000000000400
>>  [  676.150026] Process resize2fs (pid: 1793, threadinfo
>> ffff880046eec000, task ffff880053f824c0)
>>  [  676.150026] Stack:
>>  [  676.150026]  ffff880046eedda8 ffffffff8117971e 0000000f53529c40
>> 0000000000000000
>>  [  676.150026]  000000020000002f 0000000000000000 0000000200000030
>> ffff88005b20e990
>>  [  676.150026]  0000000100000001 ffff880000000001 0000000200000000
>> 0000000200000000
>>  [  676.150026] Call Trace:
>>  [  676.150026]  [<ffffffff8117971e>] ? do_last+0x2ee/0x9f0
>>  [  676.150026]  [<ffffffffa012e05f>] ext4_ioctl+0x9af/0xbc0 [ext4]
>>  [  676.150026]  [<ffffffff8117db6f>] do_vfs_ioctl+0x8f/0x4e0
>>  [  676.150026]  [<ffffffff8117e051>] sys_ioctl+0x91/0xa0
>>  [  676.150026]  [<ffffffff8147d0bd>] system_call_fastpath+0x1a/0x1f
>>  [  676.150026] Code: c7 c1 60 2c 17 a0 ba 0c 07 00 00 48 c7 c6 9b e8 16
>> a0 4c 89 e7 31 c0 e8 b3 80 ff ff c7 85 6c ff ff ff ea ff ff ff e9 4c f8
>> ff ff <0f> 0b 8b 55 a4 8b 45 a0 f7 da 44 31 e0 85 c2 0f 84 6c fb ff ff
>>  [  676.150026] RIP  [<ffffffffa014e9bc>] ext4_resize_fs+0x94c/0xa30 [ext4]
>>  [  676.150026]  RSP <ffff880046eedd18>
>>  [  676.788513] ---[ end trace fbf2bd5a59c2ab99 ]---
>>
>> This is BUG_ON(src_group >= group_data[0].group + flex_gd->count);
>>
>> I was using the below basic test script which uses a virtual large
>> volume in LVM (e2fsprogs is 1.42.5, except for resize2fs):
>>
>> #!/bin/bash -ex
>>
>> VG=delta
>> LV=ext4test
>> LVSIZE=40T
>> MOUNTPOINT="/mnt/iso"
>> RESIZE2FS=/root/resize2fs
>>
>> INITIAL_SIZE_K=4294967295
>> NEW_BLOCKS=8589934590
>>
>> lvcreate -l 100%FREE -V "$LVSIZE" -n "$LV" "$VG"
>> mkfs.ext4 -O meta_bg,64bit,^resize_inode "/dev/$VG/$LV" "$INITIAL_SIZE_K"
>>
>> mount "/dev/$VG/$LV" "$MOUNTPOINT"
>>
>> mkdir "$MOUNTPOINT/test"
>> for file in 1 2; do
>>         dd if=/dev/urandom bs=1M count=50 of="$MOUNTPOINT/test/$file"
>> done
>> md5sum $MOUNTPOINT/test/* > $MOUNTPOINT/MD5SUM
>>
>> for N in $NEW_BLOCKS $((NEW_BLOCKS + 50)); do
>>         $RESIZE2FS "/dev/$VG/$LV" "$N"
>>
>>         umount "$MOUNTPOINT"
>>         fsck.ext4 -nvf "/dev/$VG/$LV"
>>         mount "/dev/$VG/$LV" "$MOUNTPOINT"
>>         md5sum -c "$MOUNTPOINT/MD5SUM"
>> done
>>
>>
>>
>>> So while I very much appreciate users giving the code a try and
>>> sending us feedback, please do think twice before using this code on
>>> file systems with data that hasn't been backed up recently.  (Of
>>> course, being good System Administrators you are all keeping --- and
>>> verifying --- regular backups, right?  :-)
>>
>> --
>> Anssi Hannula
> 
> 
>
Yongqiang Yang Sept. 6, 2012, 4:19 p.m. UTC | #14
Thanks for your testing.

Yongqiang.

On Thu, Sep 6, 2012 at 10:22 PM, Anssi Hannula <anssi.hannula@iki.fi> wrote:
> 05.09.2012 05:10, Yongqiang Yang kirjoitti:
>> Hi Anssi,
>
> Hi,
>
>> The bug was fixed for a while, please check the patches:
>> [PATCH 1/2] ext4: teach resize report old blocks count correctly
>> [PATCH 2/2] ext4: ignore last group without enough space when resizing
>>
>> Please have a try!!!
>
> Confirmed that with these patches the simple test passes :)
>
>
>> Thanks,
>> Yongqiang.
>>
>> On Wed, Sep 5, 2012 at 1:05 AM, Anssi Hannula <anssi.hannula@iki.fi> wrote:
>>> 04.09.2012 05:14, Theodore Ts'o kirjoitti:
>>>> On Tue, Sep 04, 2012 at 09:59:55AM +0800, Yongqiang Yang wrote:
>>>>> Hi Kevin,
>>>>>
>>>>>   Ted has sent out the patches on online resizing for meta_bg and
>>>>> 64bits, so you can have a try again. It seems that the bug in
>>>>> e2fsprogs has been fixed.
>>>>
>>>> Make sure you use the latest version of the kernel patches that I just
>>>> sent out.  There quite a number of bugs in the Yongqiang's original
>>>> patch set which I tripped over while I was testing 64-bit resize ---
>>>> and please note that there are definitely still rough edges
>>>> (especially for in cases where the file system was created < 16TB, but
>>>> with the 64-bit feature and resize_inode features enabled).  There may
>>>> also be bugs for the straightforward case of resizing very large file
>>>> systems.
>>>
>>> Indeed, I hit a BUG_ON() on resize from 8589934590 blocks to 8589934640
>>> blocks (4k):
>>>  [  676.140165] ------------[ cut here ]------------
>>>  [  676.150026] kernel BUG at fs/ext4/resize.c:255!
>>>  [  676.150026] invalid opcode: 0000 [#1] SMP
>>>  [  676.150026] CPU 0
>>>  [  676.150026] Modules linked in:[  676.150026]  dm_snapshot dm_zero
>>> af_packet dm_mod joydev hid_generic ppdev snd_intel8x0 snd_ac97_codec
>>> ac97_bus usbhid microcode e1000 snd_pcm snd_page_alloc snd_timer hid
>>> i2c_piix4 i2c_core button snd soundcore ac parport_pc parport processor
>>> evdev ipv6 autofs4 ext4 crc16 jbd2 ohci_hcd sd_mod crc_t10dif usbcore
>>> usb_common sr_mod ata_piix ahci libahci libata scsi_mod [last unloaded:
>>> nf_defrag_ipv4]
>>>
>>>  [  676.150026] Pid: 1793, comm: resize2fs Not tainted
>>> 3.5.3-server-2anssi.9.ext4.10.2 #1 innotek GmbH VirtualBox
>>>  [  676.150026] RIP: 0010:[<ffffffffa014e9bc>]  [<ffffffffa014e9bc>]
>>> ext4_resize_fs+0x94c/0xa30 [ext4]
>>>  [  676.150026] RSP: 0018:ffff880046eedd18  EFLAGS: 00010246
>>>  [  676.150026] RAX: 0000000000040001 RBX: ffff88005b799800 RCX:
>>> 0000000000000001
>>>  [  676.150026] RDX: 0000000000081bf1 RSI: 0000000000040001 RDI:
>>> ffff88005b068000
>>>  [  676.150026] RBP: ffff880046eeddd8 R08: 0000000200000003 R09:
>>> 0000000000000000
>>>  [  676.150026] R10: 0000000000000000 R11: 0000000080042000 R12:
>>> 0000000000040001
>>>  [  676.150026] R13: ffff880037fb5e20 R14: 0000000000000000 R15:
>>> ffff88005b068000
>>>  [  676.150026] FS:  00007fb43e067740(0000) GS:ffff88005fc00000(0000)
>>> knlGS:0000000000000000
>>>  [  676.150026] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
>>>  [  676.150026] CR2: 00007fd905261178 CR3: 0000000044993000 CR4:
>>> 00000000000006f0
>>>  [  676.150026] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
>>> 0000000000000000
>>>  [  676.150026] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7:
>>> 0000000000000400
>>>  [  676.150026] Process resize2fs (pid: 1793, threadinfo
>>> ffff880046eec000, task ffff880053f824c0)
>>>  [  676.150026] Stack:
>>>  [  676.150026]  ffff880046eedda8 ffffffff8117971e 0000000f53529c40
>>> 0000000000000000
>>>  [  676.150026]  000000020000002f 0000000000000000 0000000200000030
>>> ffff88005b20e990
>>>  [  676.150026]  0000000100000001 ffff880000000001 0000000200000000
>>> 0000000200000000
>>>  [  676.150026] Call Trace:
>>>  [  676.150026]  [<ffffffff8117971e>] ? do_last+0x2ee/0x9f0
>>>  [  676.150026]  [<ffffffffa012e05f>] ext4_ioctl+0x9af/0xbc0 [ext4]
>>>  [  676.150026]  [<ffffffff8117db6f>] do_vfs_ioctl+0x8f/0x4e0
>>>  [  676.150026]  [<ffffffff8117e051>] sys_ioctl+0x91/0xa0
>>>  [  676.150026]  [<ffffffff8147d0bd>] system_call_fastpath+0x1a/0x1f
>>>  [  676.150026] Code: c7 c1 60 2c 17 a0 ba 0c 07 00 00 48 c7 c6 9b e8 16
>>> a0 4c 89 e7 31 c0 e8 b3 80 ff ff c7 85 6c ff ff ff ea ff ff ff e9 4c f8
>>> ff ff <0f> 0b 8b 55 a4 8b 45 a0 f7 da 44 31 e0 85 c2 0f 84 6c fb ff ff
>>>  [  676.150026] RIP  [<ffffffffa014e9bc>] ext4_resize_fs+0x94c/0xa30 [ext4]
>>>  [  676.150026]  RSP <ffff880046eedd18>
>>>  [  676.788513] ---[ end trace fbf2bd5a59c2ab99 ]---
>>>
>>> This is BUG_ON(src_group >= group_data[0].group + flex_gd->count);
>>>
>>> I was using the below basic test script which uses a virtual large
>>> volume in LVM (e2fsprogs is 1.42.5, except for resize2fs):
>>>
>>> #!/bin/bash -ex
>>>
>>> VG=delta
>>> LV=ext4test
>>> LVSIZE=40T
>>> MOUNTPOINT="/mnt/iso"
>>> RESIZE2FS=/root/resize2fs
>>>
>>> INITIAL_SIZE_K=4294967295
>>> NEW_BLOCKS=8589934590
>>>
>>> lvcreate -l 100%FREE -V "$LVSIZE" -n "$LV" "$VG"
>>> mkfs.ext4 -O meta_bg,64bit,^resize_inode "/dev/$VG/$LV" "$INITIAL_SIZE_K"
>>>
>>> mount "/dev/$VG/$LV" "$MOUNTPOINT"
>>>
>>> mkdir "$MOUNTPOINT/test"
>>> for file in 1 2; do
>>>         dd if=/dev/urandom bs=1M count=50 of="$MOUNTPOINT/test/$file"
>>> done
>>> md5sum $MOUNTPOINT/test/* > $MOUNTPOINT/MD5SUM
>>>
>>> for N in $NEW_BLOCKS $((NEW_BLOCKS + 50)); do
>>>         $RESIZE2FS "/dev/$VG/$LV" "$N"
>>>
>>>         umount "$MOUNTPOINT"
>>>         fsck.ext4 -nvf "/dev/$VG/$LV"
>>>         mount "/dev/$VG/$LV" "$MOUNTPOINT"
>>>         md5sum -c "$MOUNTPOINT/MD5SUM"
>>> done
>>>
>>>
>>>
>>>> So while I very much appreciate users giving the code a try and
>>>> sending us feedback, please do think twice before using this code on
>>>> file systems with data that hasn't been backed up recently.  (Of
>>>> course, being good System Administrators you are all keeping --- and
>>>> verifying --- regular backups, right?  :-)
>>>
>>> --
>>> Anssi Hannula
>>
>>
>>
>
>
> --
> Anssi Hannula
Theodore Ts'o Sept. 13, 2012, 11:21 p.m. UTC | #15
On Wed, Sep 05, 2012 at 02:32:32PM +0800, Kevin Liao wrote:
> 
> I had done some simple and quick test. The following is the result.
> 
> For 20TB with 64bit,meta_bg,^resize_inode
> mke2fs: 1m25.090s
> mount: 19.992s
> e2fsck: 2m55.048s
> 
> For 20TB without 64bit,meta_bg,^resize_inode
> mke2fs: 1m3.660s
> mount: 1.458s
> e2fsck: 1m56.055s

The reason for this is how meta_bg changes how the block group
descriptors are laid out.  Originally, the block group descriptors
were located contiguously.  From a 12T filesystem without meta_bg,
you'll see this from dumpe2fs:

Group 0: (Blocks 0-32767)
  Primary superblock at 0, Group descriptors at 1-768

If the file system is created with meta_bg, then group descriptors
that have to be read when the file system is opened by libext2fs or
when the file system is mounted look like this:

Group 0: (Blocks 0-32767)
  Primary superblock at 0, Group descriptor at 1
Group 128: (Blocks 4194304-4227071) [INODE_UNINIT]
  Group descriptor at 4194304
Group 256: (Blocks 8388608-8421375) [INODE_UNINIT]
  Group descriptor at 8388608
Group 384: (Blocks 12582912-12615679) [INODE_UNINIT]
  Group descriptor at 12582912
  ...

In the set of kernel and e2fsprogs patches that I just released, we
can partially work around this problem by starting with the
resize_inode, and only switch over to the meta_bg once we have
exhausted the resize_inode scheme.  So now we can do this:

    mke2fs -t ext4 -q -O 64bit /dev/vdc 12T
    mount /dev/vdc
    resize2fs /dev/vdc 18T

After the resize2fs, the block group descriptors for the first 16TB
will be contiguous:

Group 0: (Blocks 0-32767) [ITABLE_ZEROED]
  Primary superblock at 0, Group descriptors at 1-2048

after that, there will be singleton block group descriptor blocks, i.e.:

Group 131136: (Blocks 4297064448-4297097215) [INODE_UNINIT]
  Group descriptor at 4297064448

The other thing we can do to speed up the mount times is change the
kernel to lazily read the block group descriptors, instead of
trying to read them all at mount time, at least once they are no
longer contiguous.  I'll look into seeing what we can do to improve
things on that front.

Regards,

					- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Kevin Liao Sept. 14, 2012, 3:24 a.m. UTC | #16
2012/9/14 Theodore Ts'o <tytso@mit.edu>:
>
> The reason for this is how meta_bg changes how the block group
> descriptors are laid out.  Originally, the block group descriptors
> were located contiguously.  From a 12T filesystem without meta_bg,
> you'll see this from dumpe2fs:
>
> Group 0: (Blocks 0-32767)
>   Primary superblock at 0, Group descriptors at 1-768
>
> If the file system is created with meta_bg, then group descriptors
> that have to be read when the file system is opened by libext2fs or
> when the file system is mounted look like this:
>
> Group 0: (Blocks 0-32767)
>   Primary superblock at 0, Group descriptor at 1
> Group 128: (Blocks 4194304-4227071) [INODE_UNINIT]
>   Group descriptor at 4194304
> Group 256: (Blocks 8388608-8421375) [INODE_UNINIT]
>   Group descriptor at 8388608
> Group 384: (Blocks 12582912-12615679) [INODE_UNINIT]
>   Group descriptor at 12582912
>   ...
>
> In the set of kernel and e2fsprogs patches that I just released, we
> can partially work around this problem by starting with the
> resize_inode, and only switch over to the meta_bg once we have
> exhausted the resize_inode scheme.  So now we can do this:
>
>     mke2fs -t ext4 -q -O 64bit /dev/vdc 12T
>     mount /dev/vdc
>     resize2fs /dev/vdc 18T
>
> After the resize2fs, the block group descriptors for the first 16TB
> will be contiguous:
>
> Group 0: (Blocks 0-32767) [ITABLE_ZEROED]
>   Primary superblock at 0, Group descriptors at 1-2048
>
> after that, there will be singleton block group descriptor blocks, i.e.:
>
> Group 131136: (Blocks 4297064448-4297097215) [INODE_UNINIT]
>   Group descriptor at 4297064448
>
> The other thing we can do to speed up the mount times is change how
> the kernel to lazily read the block group descriptors, instead of
> trying to read them all at mount time, at least once they are no
> longer contiguous.  I'll look into seeing what we can do to improve
> things on that front.
>
> Regards,
>
>                                         - Ted

Ted, thanks a lot for the detailed explanation. It is very clear. I
will find time to test your new patches.

Regards,
Kevin Liao
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/resize/resize2fs.c b/resize/resize2fs.c
index dc2805d..1dce498 100644
--- a/resize/resize2fs.c
+++ b/resize/resize2fs.c
@@ -1890,6 +1890,8 @@  blk64_t calculate_minimum_resize_size(ext2_filsys fs)
 	blk64_t grp, data_needed, last_start;
 	blk64_t overhead = 0;
 	int num_of_superblocks = 0;
+	blk64_t super_overhead = 0;
+	int old_desc_blocks;
 	int extra_groups = 0;
 	int flexbg_size = 1 << fs->super->s_log_groups_per_flex;
 
@@ -1909,15 +1911,36 @@  blk64_t calculate_minimum_resize_size(ext2_filsys fs)
 	 * we need to figure out how many backup superblocks we have so we can
 	 * account for that in the metadata
 	 */
+	if (fs->super->s_feature_incompat & EXT2_FEATURE_INCOMPAT_META_BG)
+		old_desc_blocks = fs->super->s_first_meta_bg;
+	else
+		old_desc_blocks = fs->desc_blocks +
+			fs->super->s_reserved_gdt_blocks;
+
 	for (grp = 0; grp < fs->group_desc_count; grp++) {
+		blk64_t	super_blk, old_desc_blk, new_desc_blk;
+		int has_super;
+
+		ext2fs_super_and_bgd_loc2(fs, grp, &super_blk,
+					  &old_desc_blk, &new_desc_blk, 0);
+		has_super = ((grp == 0) || super_blk);
+		if (has_super)
+			super_overhead++;
+		if (old_desc_blk)
+			super_overhead += old_desc_blocks;
+		else if (new_desc_blk)
+			super_overhead++;
 		if (ext2fs_bg_has_super(fs, grp))
 			num_of_superblocks++;
+
 	}
+	printf("super overhead is %llu, old algorithm was %llu\n",
+	       super_overhead, SUPER_OVERHEAD(fs) * num_of_superblocks);
 
 	/* calculate how many blocks are needed for data */
 	data_needed = ext2fs_blocks_count(fs->super) -
 		ext2fs_free_blocks_count(fs->super);
-	data_needed -= SUPER_OVERHEAD(fs) * num_of_superblocks;
+	data_needed -= super_overhead;
 	data_needed -= META_OVERHEAD(fs) * fs->group_desc_count;
 
 	if (fs->super->s_feature_incompat & EXT4_FEATURE_INCOMPAT_FLEX_BG) {