Message ID | 20120625191744.GB9688@thunk.org |
---|---|
State | Accepted, archived |
Headers | show |
On 06/25/2012 12:17 PM, Theodore Ts'o wrote: > On Mon, Jun 25, 2012 at 04:51:59PM +0800, Zheng Liu wrote: >> >> Actually I want to send a url for you from linux mailing list archive but >> I cannot find it. After applying this patch, you can call ioctl(2) to >> enable expose_stale_data flag, and then when you call fallocate(2), ext4 >> create initialized extents for you. This patch cannot be merged into >> upstream kernel because it brings a huge security hole. > > This is what we're using internally inside Google.... this allows the > security exposure to be restricted to those programs running with a > specific group id (which is better than giving programs access to > CAP_SYS_RAWIO). We also require the use of a specific fallocate flag > so that programs have to explicitly ask for this feature. > > Also note that I restrict the combination of NO_HIDE_STALE && > KEEP_SIZE since it causes e2fsck to complain --- and if you're trying > to avoid fs metadata I/O, you want to avoid the extra i_size update > anyway, so it's not worth trying to make this work w/o causing e2fsck > complaints. > > This patch is versus the v3.3 kernel (as it happens, I was just in the > middle of rebasing this patch from 2.6.34 :-) > > - Ted > > P.S. It just occurred to me that there are some patches being > discussed that assign new fallocate flags for volatile data handling. > So it would probably be a good idea to move the fallocate flag > codepoint assignment up out of the way to avoid future conflicts. > > commit 5f12f1bc2b0fb0866d52763a611b022780780f05 > Author: Theodore Ts'o <tytso@google.com> > Date: Fri Jun 22 17:19:53 2012 -0400 > > ext4: add an fallocate flag to mark newly allocated extents initialized > > This commit adds a new flag to ext4's fallocate that allows new, > uninitialized extents to be marked as initialized. 
This flag, > FALLOC_FL_NO_HIDE_STALE requires that the nohide_stale_gid=<gid> mount > option be used when the file system is mounted, and that the user is > in the group <gid>. > > The benefit is to a program fallocates a larger space, but then writes > to that space in small increments. This option prevents ext4 from > having to split the unallocated extent and merge the newly initialized > extent with the extent to its left. Even though this usually happens > in-memory, this option is useful for tight memory situations and for > ext4 on flash. Note: This allows an application in ths hohide_stale > group to see stale data on the filesystem. > > Tested: Updated xfstests g002 to test a case where > fallocate:no-hide-stale is not allowed. The existing tests now pass > because I added a remount with a group that user root is in. > Rebase-Tested-v3.3: same > > Effort: fs/nohide-stale > Origin-2.6.34-SHA1: c3099bf61be1baf94bc91c481995bb0d77f05786 > Origin-2.6.34-SHA1: 004dd33b9ebc5d860781c3435526658cc8aa8ccb > Change-Id: I0d2a7f2a4cf34443269acbcedb7b7074e0055e69 > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h > index aaaece6..ac7aa42 100644 > --- a/fs/ext4/ext4.h > +++ b/fs/ext4/ext4.h > @@ -1240,6 +1240,9 @@ struct ext4_sb_info { > unsigned long s_mb_last_group; > unsigned long s_mb_last_start; > > + /* gid that's allowed to see stale data via falloc flag. 
*/ > + gid_t no_hide_stale_gid; > + > /* stats for buddy allocator */ > atomic_t s_bal_reqs; /* number of reqs with len > 1 */ > atomic_t s_bal_success; /* we found long enough chunks */ > diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c > index cb99346..cc57c85 100644 > --- a/fs/ext4/extents.c > +++ b/fs/ext4/extents.c > @@ -4375,6 +4375,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) > int retries = 0; > int flags; > struct ext4_map_blocks map; > + struct ext4_sb_info *sbi; > unsigned int credits, blkbits = inode->i_blkbits; > > /* > @@ -4385,12 +4386,28 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) > return -EOPNOTSUPP; > > /* Return error if mode is not supported */ > - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) > + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | > + FALLOC_FL_NO_HIDE_STALE)) > + return -EOPNOTSUPP; > + > + /* The combination of NO_HIDE_STALE and KEEP_SIZE is not supported */ > + if ((mode & FALLOC_FL_NO_HIDE_STALE) && > + (mode & FALLOC_FL_KEEP_SIZE)) > return -EOPNOTSUPP; > > if (mode & FALLOC_FL_PUNCH_HOLE) > return ext4_punch_hole(file, offset, len); > > + sbi = EXT4_SB(inode->i_sb); > + /* Must have RAWIO to see stale data. 
*/ > + if ((mode & FALLOC_FL_NO_HIDE_STALE) && > + !in_egroup_p(sbi->no_hide_stale_gid)) > + return -EACCES; > + > + /* preallocation to directories is currently not supported */ > + if (S_ISDIR(inode->i_mode)) > + return -ENODEV; > + > trace_ext4_fallocate_enter(inode, offset, len, mode); > map.m_lblk = offset >> blkbits; > /* > @@ -4429,6 +4446,8 @@ retry: > ret = PTR_ERR(handle); > break; > } > + if (mode & FALLOC_FL_NO_HIDE_STALE) > + flags &= ~EXT4_GET_BLOCKS_UNINIT_EXT; > ret = ext4_map_blocks(handle, inode, &map, flags); > if (ret <= 0) { > #ifdef EXT4FS_DEBUG > diff --git a/fs/ext4/super.c b/fs/ext4/super.c > index 5b443a8..d976ec1 100644 > --- a/fs/ext4/super.c > +++ b/fs/ext4/super.c > @@ -1175,6 +1175,8 @@ static int ext4_show_options(struct seq_file *seq, struct dentry *root) > if (test_opt2(sb, BIG_EXT)) > seq_puts(seq, ",big_extent"); > #endif > + if (sbi->no_hide_stale_gid != -1) > + seq_printf(seq, ",nohide_stale_gid=%u", sbi->no_hide_stale_gid); > > ext4_show_quota_options(seq, sb); > > @@ -1353,6 +1355,7 @@ enum { > #ifdef CONFIG_EXT4_BIG_EXTENT > Opt_big_extent, Opt_nobig_extent, > #endif > + Opt_nohide_stale_gid, > }; > > static const match_table_t tokens = { > @@ -1432,6 +1435,7 @@ static const match_table_t tokens = { > {Opt_big_extent, "big_extent"}, > {Opt_nobig_extent, "nobig_extent"}, > #endif > + {Opt_nohide_stale_gid, "nohide_stale_gid=%u"}, > {Opt_err, NULL}, > }; > > @@ -1931,6 +1935,12 @@ set_qf_format: > return 0; > sbi->s_li_wait_mult = option; > break; > + case Opt_nohide_stale_gid: > + if (match_int(&args[0], &option)) > + return 0; > + /* -1 for disabled, otherwise it's valid. 
*/ > + sbi->no_hide_stale_gid = option; > + break; > case Opt_noinit_itable: > clear_opt(sb, INIT_INODE_TABLE); > break; > @@ -3274,6 +3284,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) > #ifdef CONFIG_EXT4_BIG_EXTENT > sbi->s_min_big_ext_size = EXT4_DEFAULT_MIN_BIG_EXT_SIZE; > #endif > + /* Default to having no-hide-stale disabled. */ > + sbi->no_hide_stale_gid = -1; > > if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) > set_opt(sb, BARRIER); > diff --git a/fs/open.c b/fs/open.c > index 201431a..4edc0cd 100644 > --- a/fs/open.c > +++ b/fs/open.c > @@ -224,7 +224,9 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) > return -EINVAL; > > /* Return error if mode is not supported */ > - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) > + if (mode & ~(FALLOC_FL_KEEP_SIZE | > + FALLOC_FL_PUNCH_HOLE | > + FALLOC_FL_NO_HIDE_STALE)) > return -EOPNOTSUPP; > > /* Punch hole must have keep size set */ > diff --git a/include/linux/falloc.h b/include/linux/falloc.h > index 73e0b62..a2489ac 100644 > --- a/include/linux/falloc.h > +++ b/include/linux/falloc.h > @@ -3,6 +3,7 @@ > > #define FALLOC_FL_KEEP_SIZE 0x01 /* default is extend size */ > #define FALLOC_FL_PUNCH_HOLE 0x02 /* de-allocates range */ > +#define FALLOC_FL_NO_HIDE_STALE 0x04 /* default is hide stale data */ > > #ifdef __KERNEL__ > > Thanks Ted. This patch is very nice and addresses the comments of Andreas of using a mount option. -Fredrick -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/25/2012 03:17 PM, Theodore Ts'o wrote: > On Mon, Jun 25, 2012 at 04:51:59PM +0800, Zheng Liu wrote: >> Actually I want to send a url for you from linux mailing list archive but >> I cannot find it. After applying this patch, you can call ioctl(2) to >> enable expose_stale_data flag, and then when you call fallocate(2), ext4 >> create initialized extents for you. This patch cannot be merged into >> upstream kernel because it brings a huge security hole. > This is what we're using internally inside Google.... this allows the > security exposure to be restricted to those programs running with a > specific group id (which is better than giving programs access to > CAP_SYS_RAWIO). We also require the use of a specific fallocate flag > so that programs have to explicitly ask for this feature. > > Also note that I restrict the combination of NO_HIDE_STALE && > KEEP_SIZE since it causes e2fsck to complain --- and if you're trying > to avoid fs metadata I/O, you want to avoid the extra i_size update > anyway, so it's not worth trying to make this work w/o causing e2fsck > complaints. > > This patch is versus the v3.3 kernel (as it happens, I was just in the > middle of rebasing this patch from 2.6.34 :-) > > - Ted Hi Ted, Has anyone made progress digging into the performance impact of running without this patch? We should definitely see if there is some low hanging fruit there, especially given that XFS does not seem to suffer such a huge hit. I think that we need to get a good reproducer for the workload that causes the pain and start to dig into this. Opening this security exposure is still something that is clearly a hack and best avoided if we can fix the root cause :) Ric > > P.S. It just occurred to me that there are some patches being > discussed that assign new fallocate flags for volatile data handling. > So it would probably be a good idea to move the fallocate flag > codepoint assignment up out of the way to avoid future conflicts. 
> > commit 5f12f1bc2b0fb0866d52763a611b022780780f05 > Author: Theodore Ts'o <tytso@google.com> > Date: Fri Jun 22 17:19:53 2012 -0400 > > ext4: add an fallocate flag to mark newly allocated extents initialized > > This commit adds a new flag to ext4's fallocate that allows new, > uninitialized extents to be marked as initialized. This flag, > FALLOC_FL_NO_HIDE_STALE requires that the nohide_stale_gid=<gid> mount > option be used when the file system is mounted, and that the user is > in the group <gid>. > > The benefit is to a program fallocates a larger space, but then writes > to that space in small increments. This option prevents ext4 from > having to split the unallocated extent and merge the newly initialized > extent with the extent to its left. Even though this usually happens > in-memory, this option is useful for tight memory situations and for > ext4 on flash. Note: This allows an application in ths hohide_stale > group to see stale data on the filesystem. > > Tested: Updated xfstests g002 to test a case where > fallocate:no-hide-stale is not allowed. The existing tests now pass > because I added a remount with a group that user root is in. > Rebase-Tested-v3.3: same > > Effort: fs/nohide-stale > Origin-2.6.34-SHA1: c3099bf61be1baf94bc91c481995bb0d77f05786 > Origin-2.6.34-SHA1: 004dd33b9ebc5d860781c3435526658cc8aa8ccb > Change-Id: I0d2a7f2a4cf34443269acbcedb7b7074e0055e69 > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h > index aaaece6..ac7aa42 100644 > --- a/fs/ext4/ext4.h > +++ b/fs/ext4/ext4.h > @@ -1240,6 +1240,9 @@ struct ext4_sb_info { > unsigned long s_mb_last_group; > unsigned long s_mb_last_start; > > + /* gid that's allowed to see stale data via falloc flag. 
*/ > + gid_t no_hide_stale_gid; > + > /* stats for buddy allocator */ > atomic_t s_bal_reqs; /* number of reqs with len > 1 */ > atomic_t s_bal_success; /* we found long enough chunks */ > diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c > index cb99346..cc57c85 100644 > --- a/fs/ext4/extents.c > +++ b/fs/ext4/extents.c > @@ -4375,6 +4375,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) > int retries = 0; > int flags; > struct ext4_map_blocks map; > + struct ext4_sb_info *sbi; > unsigned int credits, blkbits = inode->i_blkbits; > > /* > @@ -4385,12 +4386,28 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) > return -EOPNOTSUPP; > > /* Return error if mode is not supported */ > - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) > + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | > + FALLOC_FL_NO_HIDE_STALE)) > + return -EOPNOTSUPP; > + > + /* The combination of NO_HIDE_STALE and KEEP_SIZE is not supported */ > + if ((mode & FALLOC_FL_NO_HIDE_STALE) && > + (mode & FALLOC_FL_KEEP_SIZE)) > return -EOPNOTSUPP; > > if (mode & FALLOC_FL_PUNCH_HOLE) > return ext4_punch_hole(file, offset, len); > > + sbi = EXT4_SB(inode->i_sb); > + /* Must have RAWIO to see stale data. 
*/ > + if ((mode & FALLOC_FL_NO_HIDE_STALE) && > + !in_egroup_p(sbi->no_hide_stale_gid)) > + return -EACCES; > + > + /* preallocation to directories is currently not supported */ > + if (S_ISDIR(inode->i_mode)) > + return -ENODEV; > + > trace_ext4_fallocate_enter(inode, offset, len, mode); > map.m_lblk = offset >> blkbits; > /* > @@ -4429,6 +4446,8 @@ retry: > ret = PTR_ERR(handle); > break; > } > + if (mode & FALLOC_FL_NO_HIDE_STALE) > + flags &= ~EXT4_GET_BLOCKS_UNINIT_EXT; > ret = ext4_map_blocks(handle, inode, &map, flags); > if (ret <= 0) { > #ifdef EXT4FS_DEBUG > diff --git a/fs/ext4/super.c b/fs/ext4/super.c > index 5b443a8..d976ec1 100644 > --- a/fs/ext4/super.c > +++ b/fs/ext4/super.c > @@ -1175,6 +1175,8 @@ static int ext4_show_options(struct seq_file *seq, struct dentry *root) > if (test_opt2(sb, BIG_EXT)) > seq_puts(seq, ",big_extent"); > #endif > + if (sbi->no_hide_stale_gid != -1) > + seq_printf(seq, ",nohide_stale_gid=%u", sbi->no_hide_stale_gid); > > ext4_show_quota_options(seq, sb); > > @@ -1353,6 +1355,7 @@ enum { > #ifdef CONFIG_EXT4_BIG_EXTENT > Opt_big_extent, Opt_nobig_extent, > #endif > + Opt_nohide_stale_gid, > }; > > static const match_table_t tokens = { > @@ -1432,6 +1435,7 @@ static const match_table_t tokens = { > {Opt_big_extent, "big_extent"}, > {Opt_nobig_extent, "nobig_extent"}, > #endif > + {Opt_nohide_stale_gid, "nohide_stale_gid=%u"}, > {Opt_err, NULL}, > }; > > @@ -1931,6 +1935,12 @@ set_qf_format: > return 0; > sbi->s_li_wait_mult = option; > break; > + case Opt_nohide_stale_gid: > + if (match_int(&args[0], &option)) > + return 0; > + /* -1 for disabled, otherwise it's valid. 
*/ > + sbi->no_hide_stale_gid = option; > + break; > case Opt_noinit_itable: > clear_opt(sb, INIT_INODE_TABLE); > break; > @@ -3274,6 +3284,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) > #ifdef CONFIG_EXT4_BIG_EXTENT > sbi->s_min_big_ext_size = EXT4_DEFAULT_MIN_BIG_EXT_SIZE; > #endif > + /* Default to having no-hide-stale disabled. */ > + sbi->no_hide_stale_gid = -1; > > if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) > set_opt(sb, BARRIER); > diff --git a/fs/open.c b/fs/open.c > index 201431a..4edc0cd 100644 > --- a/fs/open.c > +++ b/fs/open.c > @@ -224,7 +224,9 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) > return -EINVAL; > > /* Return error if mode is not supported */ > - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) > + if (mode & ~(FALLOC_FL_KEEP_SIZE | > + FALLOC_FL_PUNCH_HOLE | > + FALLOC_FL_NO_HIDE_STALE)) > return -EOPNOTSUPP; > > /* Punch hole must have keep size set */ > diff --git a/include/linux/falloc.h b/include/linux/falloc.h > index 73e0b62..a2489ac 100644 > --- a/include/linux/falloc.h > +++ b/include/linux/falloc.h > @@ -3,6 +3,7 @@ > > #define FALLOC_FL_KEEP_SIZE 0x01 /* default is extend size */ > #define FALLOC_FL_PUNCH_HOLE 0x02 /* de-allocates range */ > +#define FALLOC_FL_NO_HIDE_STALE 0x04 /* default is hide stale data */ > > #ifdef __KERNEL__ > > -- > To unsubscribe from this list: send the line "unsubscribe linux-ext4" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jun 26, 2012 at 09:13:35AM -0400, Ric Wheeler wrote: > > Has anyone made progress digging into the performance impact of > running without this patch? We should definitely see if there is > some low hanging fruit there, especially given that XFS does not > seem to suffer such a huge hit. I just haven't had time, sorry. It's so much easier to run with the patch. :-) Part of the problem is certainly caused by the fact that ext4 is using physical block journaling instead of logical journalling. But we see the problem in no-journal mode as well. I think part of the problem is simply that many of the workloads where people are doing this, they also care about robustness after power failures, and if you are doing random writes into uninitialized space, with fsyncs in-between, you are basically guaranteed a 2x expansion in the number of writes you need to do to the system. One other thing which we *have* seen is that we need to do a better job with extent merging; if you run without this patch, and you run with fio in AIO mode where you are doing tons and tons of random writes into uninitialized space, you can end up fragmenting the extent tree very badly. So fixing this would certainly help. > Opening this security exposure is still something that is clearly a > hack and best avoided if we can fix the root cause :) See Linus's recent rant about how security arguments made by theoreticians very often end up getting trumped by practical matters. 
If you are running a daemon, whether it is a user-mode cluster file system, or a database server, where it is (a) fundamentally trusted, and (b) doing its own user-space checksumming and its own guarantees to never return uninitialized data, even if we fix all potential problems, we *still* can be reducing the number of random writes --- and on a fully loaded system, we're guaranteed to be seek-constrained, so each random write to update fs metadata means that you're burning 0.5% of your 200 seeks/second on your 3TB disk (where previously you had half a dozen 500gig disks each with 200 seeks/second). I agree with you that it would be nice to look into this further, and optimizing our extent merging is definitely on the hot list of performance improvements to look at. But people who are using ext4 as back-end database servers or cluster file system servers and who are interested in wringing out every last percentage of performance are going to be interested in this technique, no matter what we do. If you have Sagans and Sagans of servers all over the world, even a tenth of a percentage point performance improvement can easily translate into big dollars. - Ted -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> Hi Ted, > > Has anyone made progress digging into the performance impact of running > without this patch? We should definitely see if there is some low > hanging fruit there, especially given that XFS does not seem to suffer > such a huge hit. > > I think that we need to get a good reproducer for the workload that > causes the pain and start to dig into this. > > Opening this security exposure is still something that is clearly a hack > and best avoided if we can fix the root cause :) > > Ric > >> Hi Ric, I had run perf stat on ext4 functions between two runs of our program writing data to a file for the first time and writing data to the file for the second time(where the extents are initialized). The amount of data written is same between the two runs. left is first time right is second time. < 42 ext4:ext4_mb_bitmap_load < 42 ext4:ext4_mb_buddy_bitmap_load < 642 ext4:ext4_mb_new_inode_pa < 645 ext4:ext4_mballoc_alloc < 9,596 ext4:ext4_mballoc_prealloc < 10,240 ext4:ext4_da_update_reserve_space --- > 7,413 ext4:ext4_mark_inode_dirty 49d52 < 10,241 ext4:ext4_allocate_blocks 51d53 < 10,241 ext4:ext4_request_blocks 55d56 < 1,310,720 ext4:ext4_da_reserve_space 58,60c59,60 < 1,331,288 ext4:ext4_ext_map_blocks_enter < 1,331,288 ext4:ext4_ext_map_blocks_exit < 1,341,467 ext4:ext4_mark_inode_dirty --- > 1,310,806 ext4:ext4_ext_map_blocks_enter > 1,310,806 ext4:ext4_ext_map_blocks_exit May be the mballocs have overhead. I ll try to compare numbers on XFS during this week. -Fredrick -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/26/2012 10:30 AM, Theodore Ts'o wrote: > On Tue, Jun 26, 2012 at 09:13:35AM -0400, Ric Wheeler wrote: >> >> Has anyone made progress digging into the performance impact of >> running without this patch? We should definitely see if there is >> some low hanging fruit there, especially given that XFS does not >> seem to suffer such a huge hit. > > I just haven't had time, sorry. It's so much easier to run with the > patch. :-) > > Part of the problem certainly caused by the fact that ext4 is using > physical block journaling instead of logical journalling. But we see > the problem in no-journal mode as well. I think part of the problem > is simply that many of the workloads where people are doing this, they > also care about robustness after power failures, and if you are doing > random writes into uninitialized space, with fsyncs in-between, you > are basically guaranteed a 2x expansion in the number of writes you > need to do to the system. > Even our workload is the same as above. Our programs write a chunk and do fsync for robustness. This happens repeatedly on the file as the program pushes more data on the disk. > One other thing which we *have* seen is that we need to do a better > job with extent merging; if you run without this patch, and you run > with fio in AIO mode where you are doing tons and tons of random > writes into uninitialized space, you can end up fragmenting the extent > tree very badly. So fixing this would certainly help. > >> Opening this security exposure is still something that is clearly a >> hack and best avoided if we can fix the root cause :) > > See Linus's recent rant about how security arguments made by > theoreticians very often end up getting trumped by practical matters. 
> If you are running a daemon, whether it is a user-mode cluster file > system, or a database server, where it is (a) fundamentally trusted, > and (b) doing its own user-space checksuming and its own guarantees to > never return uninitialized data, even if we fix all potential > problems, we *still* can be reducing the number of random writes --- > and on a fully loaded system, we're guaranteed to be seek-constrained, > so each random write to update fs metadata means that you're burning > 0.5% of your 200 seeks/second on your 3TB disk (where previously you > had half a dozen 500gig disks each with 200 seeks/second). > I can see the performance degradation on SSDs too, though the percentage is less compared to SATA. > I agree with you that it would be nice to look into this further, and > optimizing our extent merging is definitely on the hot list of > perofrmance improvements to look at. But people who are using ext4 as > back-end database servers or cluster file system servers and who are > interested in wringing out every last percentage of performace are > going to be interested in this technique, no matter what we do. If > you have Sagans and Sagans of servers all over the world, even a tenth > of a percentage point performance improvement can easily translate > into big dollars. > Sailing the same boat. :) > - Ted > -Fredrick -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/26/2012 01:30 PM, Theodore Ts'o wrote: > On Tue, Jun 26, 2012 at 09:13:35AM -0400, Ric Wheeler wrote: >> Has anyone made progress digging into the performance impact of >> running without this patch? We should definitely see if there is >> some low hanging fruit there, especially given that XFS does not >> seem to suffer such a huge hit. > I just haven't had time, sorry. It's so much easier to run with the > patch. :-) > > Part of the problem certainly caused by the fact that ext4 is using > physical block journaling instead of logical journalling. But we see > the problem in no-journal mode as well. I think part of the problem > is simply that many of the workloads where people are doing this, they > also care about robustness after power failures, and if you are doing > random writes into uninitialized space, with fsyncs in-between, you > are basically guaranteed a 2x expansion in the number of writes you > need to do to the system. It would be interesting to see if simply not doing the preallocation would be easier and safer. Better yet, figure out how to leverage trim commands to safely allow us to preallocate and not expose other users' data (and not have to mark the extents as allocated but not written). If we did that, we would have the performance boost without the security hole. > > One other thing which we *have* seen is that we need to do a better > job with extent merging; if you run without this patch, and you run > with fio in AIO mode where you are doing tons and tons of random > writes into uninitialized space, you can end up fragmenting the extent > tree very badly. So fixing this would certainly help. Definitely sounds like something worth fixing. > >> Opening this security exposure is still something that is clearly a >> hack and best avoided if we can fix the root cause :) > See Linus's recent rant about how security arguments made by > theoreticians very often end up getting trumped by practical matters. 
> If you are running a daemon, whether it is a user-mode cluster file > system, or a database server, where it is (a) fundamentally trusted, > and (b) doing its own user-space checksuming and its own guarantees to > never return uninitialized data, even if we fix all potential > problems, we *still* can be reducing the number of random writes --- > and on a fully loaded system, we're guaranteed to be seek-constrained, > so each random write to update fs metadata means that you're burning > 0.5% of your 200 seeks/second on your 3TB disk (where previously you > had half a dozen 500gig disks each with 200 seeks/second). This is not a theory guy worry. I would not use any server/web service that knowingly enabled this hack in a multi-user machine and would not enable it for any enterprise customers. This is a real world, hard promise that we will let other users see your data in a trivial way (with your patch, only if they have the capability set). > > I agree with you that it would be nice to look into this further, and > optimizing our extent merging is definitely on the hot list of > perofrmance improvements to look at. But people who are using ext4 as > back-end database servers or cluster file system servers and who are > interested in wringing out every last percentage of performace are > going to be interested in this technique, no matter what we do. If > you have Sagans and Sagans of servers all over the world, even a tenth > of a percentage point performance improvement can easily translate > into big dollars. > > - Ted We should be very, very careful not to strip away the usefulness of file system just to cater to some users. 
You could deliver the same performance safely in a few ways I think: * fully allocate the file (by actually writing the data once in large IO's) * don't preallocate and write with large enough IO's to populate the file (you can tweak this by doing something like allocation of largish chunks on first write to a region) * fix our preallocation to use discard (trim) primitives (note you can also use "WRITE_SAME" to init regions of blocks, but that can be quite slow) ric -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jun 26, 2012 at 02:21:24PM -0400, Ric Wheeler wrote: > > It would be interesting to see if simply not doing the preallocation > would be easier and safer. Better yet, figure out how to leverage > trim commands to safely allow us to preallocate and not expose other > users' data (and not have to mark the extents as allocated but not > written). TRIM is applicable to SSD's and enterprise storage arrays. It's not applicable to HDDs. > This is not a theory guy worry. I would not use any server/web > service that knowingly enabled this hack in a multi-user machine and > would not enable it for any enterprise customers. Sure, but for single-user or for dedicated cluster file system server, it makes sense; it's not going to be useful for all customers, sure, but for some customers it will make sense. > We should be very, very careful not to strip away the usefulness of > file system just to cater to some users. A mount option or a superblock flag does not "strip away the usefulness of the file system". It makes it more useful for some use cases. Your alternate proposals (preinitializing the space with zeros, using trim, writing larger chunks) will work for some use cases, certainly. But it's not going to be sufficient for at least some use cases, or they will simply not work. Fundamentally it sounds to me like the biggest problem is that you don't trust your customers, and so you're afraid people will use the no-hide-stale feature if it becomes available, even if it's not needed or if it's needed but you're afraid that they don't understand the security tradeoffs. - Ted -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jun 26, 2012 at 11:05:40AM -0700, Fredrick wrote: > > I had run perf stat on ext4 functions between two runs of our program > writing data to a file for the first time and writing data to the file > for the second time(where the extents are initialized). From your mballoc differences, it sounds like you were comparing fallocate with not using fallocate at all; is that right? The comparison you need to do is using normal fallocate versus fallocate with the no-hide-stale feature enabled. It's obvious that allocating blocks as you need will always be more expensive than using fallocate. - Ted -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/26/2012 02:57 PM, Ted Ts'o wrote: > On Tue, Jun 26, 2012 at 02:21:24PM -0400, Ric Wheeler wrote: >> It would be interesting to see if simply not doing the preallocation >> would be easier and safer. Better yet, figure out how to leverage >> trim commands to safely allow us to preallocate and not expose other >> users' data (and not have to mark the extents as allocated but not >> written). > TRIM is applicable to SSD's and enterprise storage arrays. It's not > applicable HDD's. and device mapper can also support TRIM these days with the thinly provisioned lun targets (although this just moves the multiple IO issues down to device mapper :)) We would only use it when the device supports it of course. Also note that you can use WRITE_SAME on pretty much any drive (although again, it can be slow). > >> This is not a theory guy worry. I would not use any server/web >> service that knowingly enabled this hack in a multi-user machine and >> would not enable it for any enterprise customers. > Sure, but for single-user or for dedicated cluster file system server, > it makes sense; it's not not going to be useful for all customers, > sure, but for some customers it will make sense. The danger is that people use that google thing and see "ext4 performance improvement" and then turn it on without understanding what they bought. Believe me, I deal with that *a lot* in my day job :) > >> We should be very, very careful not to strip away the usefulness of >> file system just to cater to some users. > A mount option or a superblock flag does not "strip away the > usefulness of the file system". It makes it more useful for some use > cases. > > Your alternate proposals (preinitializing the space with zeros, using > trim, writing larger chunks) will work for some use cases, certainly. > But it's not going to be sufficient for at least some use cases, or > they will simply not work. 
I think that for the use case discussed in this thread it would probably work quite well, but always worth testing of course. > > Fundamentally it sounds to me the biggest problem is that you don't > trust your customers, and so you're afraid people will use the > no-hide-stale feature if it becomes available, even if it's not needed > or if it's needed but you're afraid that they don't understand the > security tradeoffs. > > - Ted No Ted, we don't trust exposing other customers data in order to cover up a poor design that other file systems don't suffer from. Ric -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/26/2012 02:05 PM, Fredrick wrote: > >> Hi Ted, >> >> Has anyone made progress digging into the performance impact of running >> without this patch? We should definitely see if there is some low >> hanging fruit there, especially given that XFS does not seem to suffer >> such a huge hit. >> >> I think that we need to get a good reproducer for the workload that >> causes the pain and start to dig into this. >> >> Opening this security exposure is still something that is clearly a hack >> and best avoided if we can fix the root cause :) >> >> Ric >> >>> > > Hi Ric, > > I had run perf stat on ext4 functions between two runs of our program > writing data to a file for the first time and writing data to the file > for the second time(where the extents are initialized). > The amount of data written is same between the two runs. > > left is first time > right is second time. > > > < 42 ext4:ext4_mb_bitmap_load > < 42 ext4:ext4_mb_buddy_bitmap_load > < 642 ext4:ext4_mb_new_inode_pa > < 645 ext4:ext4_mballoc_alloc > < 9,596 ext4:ext4_mballoc_prealloc > < 10,240 ext4:ext4_da_update_reserve_space > --- > > 7,413 ext4:ext4_mark_inode_dirty > 49d52 > < 10,241 ext4:ext4_allocate_blocks > 51d53 > < 10,241 ext4:ext4_request_blocks > 55d56 > < 1,310,720 ext4:ext4_da_reserve_space > 58,60c59,60 > < 1,331,288 ext4:ext4_ext_map_blocks_enter > < 1,331,288 ext4:ext4_ext_map_blocks_exit > < 1,341,467 ext4:ext4_mark_inode_dirty > --- > > 1,310,806 ext4:ext4_ext_map_blocks_enter > > 1,310,806 ext4:ext4_ext_map_blocks_exit > > > May be the mballocs have overhead. > > I ll try to compare numbers on XFS during this week. > > -Fredrick > Thanks! Eric is also running some tests to evaluate the impact of various techniques :) ric -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 6/26/12 3:30 PM, Ric Wheeler wrote: > Thanks! Eric is also running some tests to evaluate the impact of various techniques :) > > ric Yup forgive me for interjecting actual numbers into the discussion ;) I tried running this fio recipe on v3.3, which I think does a decent job of emulating the situation (fallocate 1G, do random 1M writes into it, with fsyncs after each): [test] filename=testfile rw=randwrite size=1g filesize=1g bs=1024k ioengine=sync fallocate=1 fsync=1 Stock ext4 (3 tests w/ file remove & cache drop in between): WRITE: io=1024.0MB, aggrb=16322KB/s, minb=16713KB/s, maxb=16713KB/s, mint=64243msec, maxt=64243msec WRITE: io=1024.0MB, aggrb=16249KB/s, minb=16639KB/s, maxb=16639KB/s, mint=64528msec, maxt=64528msec WRITE: io=1024.0MB, aggrb=16370KB/s, minb=16763KB/s, maxb=16763KB/s, mint=64052msec, maxt=64052msec With the patch which exposes other users' data: WRITE: io=1024.0MB, aggrb=17840KB/s, minb=18268KB/s, maxb=18268KB/s, mint=58776msec, maxt=58776msec WRITE: io=1024.0MB, aggrb=17841KB/s, minb=18269KB/s, maxb=18269KB/s, mint=58773msec, maxt=58773msec WRITE: io=1024.0MB, aggrb=17828KB/s, minb=18255KB/s, maxb=18255KB/s, mint=58816msec, maxt=58816msec so about 10% faster than without. XFS, FWIW: WRITE: io=1024.0MB, aggrb=24008KB/s, minb=24584KB/s, maxb=24584KB/s, mint=43675msec, maxt=43675msec WRITE: io=1024.0MB, aggrb=24069KB/s, minb=24647KB/s, maxb=24647KB/s, mint=43564msec, maxt=43564msec WRITE: io=1024.0MB, aggrb=24054KB/s, minb=24632KB/s, maxb=24632KB/s, mint=43591msec, maxt=43591msec which is 35% faster than ext4 with the risky patch. Haven't yet tried overwrites or done any tracing or profiling, but I think the fio recipe is a decent demonstrator, I'll try the overwrites etc in a bit when I get a moment. -Eric -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 6/26/12 3:57 PM, Eric Sandeen wrote: > On 6/26/12 3:30 PM, Ric Wheeler wrote: > >> Thanks! Eric is also running some tests to evaluate the impact of various techniques :) >> >> ric > > Yup forgive me for interjecting actual numbers into the discussion ;) > > I tried running this fio recipe on v3.3, which I think does a decent job of > emulating the situation (fallocate 1G, do random 1M writes into it, with > fsyncs after each): > > [test] > filename=testfile > rw=randwrite > size=1g > filesize=1g > bs=1024k > ioengine=sync > fallocate=1 > fsync=1 > > Stock ext4 (3 tests w/ file remove & cache drop in between): > > WRITE: io=1024.0MB, aggrb=16322KB/s, minb=16713KB/s, maxb=16713KB/s, mint=64243msec, maxt=64243msec > WRITE: io=1024.0MB, aggrb=16249KB/s, minb=16639KB/s, maxb=16639KB/s, mint=64528msec, maxt=64528msec > WRITE: io=1024.0MB, aggrb=16370KB/s, minb=16763KB/s, maxb=16763KB/s, mint=64052msec, maxt=64052msec And as a sanity check, here are the rates for overwriting existing, written blocks in the file: WRITE: io=1024.0MB, aggrb=17778KB/s, minb=18205KB/s, maxb=18205KB/s, mint=58980msec, maxt=58980msec WRITE: io=1024.0MB, aggrb=17825KB/s, minb=18252KB/s, maxb=18252KB/s, mint=58826msec, maxt=58826msec WRITE: io=1024.0MB, aggrb=17769KB/s, minb=18195KB/s, maxb=18195KB/s, mint=59010msec, maxt=59010msec so this does look like about ~10% overhead for converting the extents. 
> With the patch which exposes other users' data: > > WRITE: io=1024.0MB, aggrb=17840KB/s, minb=18268KB/s, maxb=18268KB/s, mint=58776msec, maxt=58776msec > WRITE: io=1024.0MB, aggrb=17841KB/s, minb=18269KB/s, maxb=18269KB/s, mint=58773msec, maxt=58773msec > WRITE: io=1024.0MB, aggrb=17828KB/s, minb=18255KB/s, maxb=18255KB/s, mint=58816msec, maxt=58816msec overwrites: WRITE: io=1024.0MB, aggrb=17768KB/s, minb=18194KB/s, maxb=18194KB/s, mint=59014msec, maxt=59014msec WRITE: io=1024.0MB, aggrb=17855KB/s, minb=18283KB/s, maxb=18283KB/s, mint=58726msec, maxt=58726msec WRITE: io=1024.0MB, aggrb=17838KB/s, minb=18266KB/s, maxb=18266KB/s, mint=58783msec, maxt=58783msec As expected, overwriting stale data is no slower than overwriting file data. > so about 10% faster than without. > > XFS, FWIW: > > WRITE: io=1024.0MB, aggrb=24008KB/s, minb=24584KB/s, maxb=24584KB/s, mint=43675msec, maxt=43675msec > WRITE: io=1024.0MB, aggrb=24069KB/s, minb=24647KB/s, maxb=24647KB/s, mint=43564msec, maxt=43564msec > WRITE: io=1024.0MB, aggrb=24054KB/s, minb=24632KB/s, maxb=24632KB/s, mint=43591msec, maxt=43591msec overwrites: WRITE: io=1024.0MB, aggrb=24108KB/s, minb=24687KB/s, maxb=24687KB/s, mint=43494msec, maxt=43494msec WRITE: io=1024.0MB, aggrb=24129KB/s, minb=24708KB/s, maxb=24708KB/s, mint=43456msec, maxt=43456msec WRITE: io=1024.0MB, aggrb=24164KB/s, minb=24744KB/s, maxb=24744KB/s, mint=43393msec, maxt=43393msec looks like negligible overhead for the conversions here, < 1%. -Eric > which is 35% faster than ext4 with the risky patch. > > Haven't yet tried overwrites or done any tracing or profiling, but I think the fio recipe is a decent demonstrator, I'll try the overwrites etc in a bit when I get a moment. 
> > -Eric > -- > To unsubscribe from this list: send the line "unsubscribe linux-ext4" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 6/26/12 4:44 PM, Eric Sandeen wrote: > On 6/26/12 3:57 PM, Eric Sandeen wrote: >> On 6/26/12 3:30 PM, Ric Wheeler wrote: ... >> I tried running this fio recipe on v3.3, which I think does a decent job of >> emulating the situation (fallocate 1G, do random 1M writes into it, with >> fsyncs after each): >> >> [test] >> filename=testfile >> rw=randwrite >> size=1g >> filesize=1g >> bs=1024k >> ioengine=sync >> fallocate=1 >> fsync=1 I realized the kernel I tested on had a lot of debugging bells & whistles turned on. For a more performance-configured v3.3 kernel, I see much less impact, more or less negligible: unwritten conversion on ext4: WRITE: io=1024.0MB, aggrb=23077KB/s, minb=23631KB/s, maxb=23631KB/s, mint=45437msec, maxt=45437msec WRITE: io=1024.0MB, aggrb=23040KB/s, minb=23593KB/s, maxb=23593KB/s, mint=45511msec, maxt=45511msec WRITE: io=1024.0MB, aggrb=23011KB/s, minb=23564KB/s, maxb=23564KB/s, mint=45567msec, maxt=45567msec overwrites on ext4: WRITE: io=1024.0MB, aggrb=23046KB/s, minb=23599KB/s, maxb=23599KB/s, mint=45499msec, maxt=45499msec WRITE: io=1024.0MB, aggrb=23222KB/s, minb=23780KB/s, maxb=23780KB/s, mint=45153msec, maxt=45153msec WRITE: io=1024.0MB, aggrb=23031KB/s, minb=23584KB/s, maxb=23584KB/s, mint=45528msec, maxt=45528msec So I guess it's interesting information; the debug kernel showed a much bigger difference; the performance-config'd kernel showed almost no difference. 
FWIW, on this same kernel, unwritten conversion on xfs: WRITE: io=1024.0MB, aggrb=28409KB/s, minb=29091KB/s, maxb=29091KB/s, mint=36909msec, maxt=36909msec WRITE: io=1024.0MB, aggrb=28372KB/s, minb=29053KB/s, maxb=29053KB/s, mint=36958msec, maxt=36958msec WRITE: io=1024.0MB, aggrb=28406KB/s, minb=29088KB/s, maxb=29088KB/s, mint=36913msec, maxt=36913msec overwrites on xfs: WRITE: io=1024.0MB, aggrb=28268KB/s, minb=28946KB/s, maxb=28946KB/s, mint=37094msec, maxt=37094msec WRITE: io=1024.0MB, aggrb=28316KB/s, minb=28995KB/s, maxb=28995KB/s, mint=37031msec, maxt=37031msec WRITE: io=1024.0MB, aggrb=28576KB/s, minb=29262KB/s, maxb=29262KB/s, mint=36694msec, maxt=36694msec -Eric -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jun 26, 2012 at 04:44:08PM -0400, Eric Sandeen wrote: > > > > I tried running this fio recipe on v3.3, which I think does a decent job of > > emulating the situation (fallocate 1G, do random 1M writes into it, with > > fsyncs after each): > > > > [test] > > filename=testfile > > rw=randwrite > > size=1g > > filesize=1g > > bs=1024k > > ioengine=sync > > fallocate=1 > > fsync=1 A better workload would be to use a blocksize of 4k. By using a blocksize of 1024k, it's not surprising that the metadata overhead is in the noise. Try something like this; this will cause the extent tree overhead to be roughly equal to the data block I/O. [global] rw=randwrite size=128m filesize=1g bs=4k ioengine=sync fallocate=1 fsync=1 [thread1] filename=testfile - Ted -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 6/27/12 3:30 PM, Theodore Ts'o wrote: > On Tue, Jun 26, 2012 at 04:44:08PM -0400, Eric Sandeen wrote: >>> >>> I tried running this fio recipe on v3.3, which I think does a decent job of >>> emulating the situation (fallocate 1G, do random 1M writes into it, with >>> fsyncs after each): >>> >>> [test] >>> filename=testfile >>> rw=randwrite >>> size=1g >>> filesize=1g >>> bs=1024k >>> ioengine=sync >>> fallocate=1 >>> fsync=1 > > A better workload would be to use a blocksize of 4k. By using a > blocksize of 1024k, it's not surprising that the metadata overhead is > in the noise. > > Try something like this; this will cause the extent tree overhead to > be roughly equal to the data block I/O. > > [global] > rw=randwrite > size=128m > filesize=1g > bs=4k > ioengine=sync > fallocate=1 > fsync=1 > > [thread1] > filename=testfile Well, ok ... TBH I changed it to size=16m to finish in under 20m.... so here are the results: fallocate 1g, do 16m of 4k random IOs, sync after each: # for I in a b c; do rm -f testfile; echo 3 > /proc/sys/vm/drop_caches; fio tytso.fio | grep 2>&1 WRITE; done WRITE: io=16384KB, aggrb=154KB/s, minb=158KB/s, maxb=158KB/s, mint=105989msec, maxt=105989msec WRITE: io=16384KB, aggrb=163KB/s, minb=167KB/s, maxb=167KB/s, mint=99906msec, maxt=99906msec WRITE: io=16384KB, aggrb=176KB/s, minb=180KB/s, maxb=180KB/s, mint=92791msec, maxt=92791msec same, but overwrite pre-written 1g file (same as the expose-my-data option ;) # dd if=/dev/zero of=testfile bs=1M count=1024 # for I in a b c; do echo 3 > /proc/sys/vm/drop_caches; fio tytso.fio | grep 2>&1 WRITE; done WRITE: io=16384KB, aggrb=164KB/s, minb=168KB/s, maxb=168KB/s, mint=99515msec, maxt=99515msec WRITE: io=16384KB, aggrb=164KB/s, minb=168KB/s, maxb=168KB/s, mint=99371msec, maxt=99371msec WRITE: io=16384KB, aggrb=164KB/s, minb=168KB/s, maxb=168KB/s, mint=99677msec, maxt=99677msec so no great surprise, small synchronous 4k writes have terrible performance, but I'm still not seeing a lot of fallocate 
overhead. xfs, FWIW: # for I in a b c; do rm -f testfile; echo 3 > /proc/sys/vm/drop_caches; fio tytso.fio | grep 2>&1 WRITE; done WRITE: io=16384KB, aggrb=202KB/s, minb=207KB/s, maxb=207KB/s, mint=80980msec, maxt=80980msec WRITE: io=16384KB, aggrb=203KB/s, minb=208KB/s, maxb=208KB/s, mint=80508msec, maxt=80508msec WRITE: io=16384KB, aggrb=204KB/s, minb=208KB/s, maxb=208KB/s, mint=80291msec, maxt=80291msec # dd if=/dev/zero of=testfile bs=1M count=1024 # for I in a b c; do echo 3 > /proc/sys/vm/drop_caches; fio tytso.fio | grep 2>&1 WRITE; done WRITE: io=16384KB, aggrb=197KB/s, minb=202KB/s, maxb=202KB/s, mint=82869msec, maxt=82869msec WRITE: io=16384KB, aggrb=203KB/s, minb=208KB/s, maxb=208KB/s, mint=80348msec, maxt=80348msec WRITE: io=16384KB, aggrb=202KB/s, minb=207KB/s, maxb=207KB/s, mint=80827msec, maxt=80827msec Again, I think this is just a diabolical workload ;) -Eric -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/27/2012 07:02 PM, Eric Sandeen wrote: > On 6/27/12 3:30 PM, Theodore Ts'o wrote: >> On Tue, Jun 26, 2012 at 04:44:08PM -0400, Eric Sandeen wrote: >>>> I tried running this fio recipe on v3.3, which I think does a decent job of >>>> emulating the situation (fallocate 1G, do random 1M writes into it, with >>>> fsyncs after each): >>>> >>>> [test] >>>> filename=testfile >>>> rw=randwrite >>>> size=1g >>>> filesize=1g >>>> bs=1024k >>>> ioengine=sync >>>> fallocate=1 >>>> fsync=1 >> A better workload would be to use a blocksize of 4k. By using a >> blocksize of 1024k, it's not surprising that the metadata overhead is >> in the noise. >> >> Try something like this; this will cause the extent tree overhead to >> be roughly equal to the data block I/O. >> >> [global] >> rw=randwrite >> size=128m >> filesize=1g >> bs=4k >> ioengine=sync >> fallocate=1 >> fsync=1 >> >> [thread1] >> filename=testfile > Well, ok ... TBH I changed it to size=16m to finish in under 20m.... so here are the results: > > fallocate 1g, do 16m of 4k random IOs, sync after each: > > # for I in a b c; do rm -f testfile; echo 3 > /proc/sys/vm/drop_caches; fio tytso.fio | grep 2>&1 WRITE; done > > WRITE: io=16384KB, aggrb=154KB/s, minb=158KB/s, maxb=158KB/s, mint=105989msec, maxt=105989msec > WRITE: io=16384KB, aggrb=163KB/s, minb=167KB/s, maxb=167KB/s, mint=99906msec, maxt=99906msec > WRITE: io=16384KB, aggrb=176KB/s, minb=180KB/s, maxb=180KB/s, mint=92791msec, maxt=92791msec > > same, but overwrite pre-written 1g file (same as the expose-my-data option ;) > > # dd if=/dev/zero of=testfile bs=1M count=1024 > # for I in a b c; do echo 3 > /proc/sys/vm/drop_caches; fio tytso.fio | grep 2>&1 WRITE; done > > WRITE: io=16384KB, aggrb=164KB/s, minb=168KB/s, maxb=168KB/s, mint=99515msec, maxt=99515msec > WRITE: io=16384KB, aggrb=164KB/s, minb=168KB/s, maxb=168KB/s, mint=99371msec, maxt=99371msec > WRITE: io=16384KB, aggrb=164KB/s, minb=168KB/s, maxb=168KB/s, mint=99677msec, maxt=99677msec > > so no 
great surprise, small synchronous 4k writes have terrible performance, but I'm still not seeing a lot of fallocate overhead. > > xfs, FWIW: > > # for I in a b c; do rm -f testfile; echo 3 > /proc/sys/vm/drop_caches; fio tytso.fio | grep 2>&1 WRITE; done > > WRITE: io=16384KB, aggrb=202KB/s, minb=207KB/s, maxb=207KB/s, mint=80980msec, maxt=80980msec > WRITE: io=16384KB, aggrb=203KB/s, minb=208KB/s, maxb=208KB/s, mint=80508msec, maxt=80508msec > WRITE: io=16384KB, aggrb=204KB/s, minb=208KB/s, maxb=208KB/s, mint=80291msec, maxt=80291msec > > # dd if=/dev/zero of=testfile bs=1M count=1024 > # for I in a b c; do echo 3 > /proc/sys/vm/drop_caches; fio tytso.fio | grep 2>&1 WRITE; done > > WRITE: io=16384KB, aggrb=197KB/s, minb=202KB/s, maxb=202KB/s, mint=82869msec, maxt=82869msec > WRITE: io=16384KB, aggrb=203KB/s, minb=208KB/s, maxb=208KB/s, mint=80348msec, maxt=80348msec > WRITE: io=16384KB, aggrb=202KB/s, minb=207KB/s, maxb=207KB/s, mint=80827msec, maxt=80827msec > > Again, I think this is just a diabolical workload ;) > > -Eric We need to keep in mind what the goal of pre-allocation is (should be?) - spend a bit of extra time doing the allocation call so we get really good, contiguous layout on disk which ultimately will help in streaming read/write workloads. If you have a reasonably small file, pre-allocation is probably simply a waste of time - you would be better off overwriting the maximum file size with all zeros (even a 1GB file would take only a few seconds). If the file is large enough to be interesting, I think that we might want to think about a scheme that would bring small random IO's more into line with the 1MB results Eric saw. One way to do that might be to have a minimum "chunk" that we would zero out for any IO to an allocated but unwritten extent. You write 4KB to the middle of said region, we pad up and zero out to the nearest MB with zeros. 
Note for the target class of drives (S-ATA) that Ted mentioned earlier, doing a random 4KB write vs a 1MB write is not that much slower (you need to pay the head movement costs already). Of course, the sweet spot might turn out to be a bit smaller or larger. Ric -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Jun 27, 2012 at 07:02:45PM -0400, Eric Sandeen wrote: > > fallocate 1g, do 16m of 4k random IOs, sync after each: > > # for I in a b c; do rm -f testfile; echo 3 > /proc/sys/vm/drop_caches; fio tytso.fio | grep 2>&1 WRITE; done > > WRITE: io=16384KB, aggrb=154KB/s, minb=158KB/s, maxb=158KB/s, mint=105989msec, maxt=105989msec > WRITE: io=16384KB, aggrb=163KB/s, minb=167KB/s, maxb=167KB/s, mint=99906msec, maxt=99906msec > WRITE: io=16384KB, aggrb=176KB/s, minb=180KB/s, maxb=180KB/s, mint=92791msec, maxt=92791msec > > same, but overwrite pre-written 1g file (same as the expose-my-data option ;) > > # dd if=/dev/zero of=testfile bs=1M count=1024 > # for I in a b c; do echo 3 > /proc/sys/vm/drop_caches; fio tytso.fio | grep 2>&1 WRITE; done > > WRITE: io=16384KB, aggrb=164KB/s, minb=168KB/s, maxb=168KB/s, mint=99515msec, maxt=99515msec > WRITE: io=16384KB, aggrb=164KB/s, minb=168KB/s, maxb=168KB/s, mint=99371msec, maxt=99371msec > WRITE: io=16384KB, aggrb=164KB/s, minb=168KB/s, maxb=168KB/s, mint=99677msec, maxt=99677msec > There's a pretty large range comparing the first set versus the second set of numbers. With the second set of numbers, the times are much more stable. I wonder if one of the things that's going on is file placement; if you're constantly deleting and recreating the file, are we getting the same block numbers or not? (And if we're not, is that something we need to look at vis-a-vis the block allocator --- especially with SSD's?) - Ted -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2012-06-28, at 5:27 AM, Ric Wheeler wrote: > We need to keep in mind what the goal of pre-allocation is (should be?) - spend a bit of extra time doing the allocation call so we get really good, contiguous layout on disk which ultimately will help in streaming read/write workloads. > > If you have a reasonably small file, pre-allocation is probably simply a waste of time - you would be better off overwriting the maximum file size with all zeros (even a 1GB file would take only a few seconds). > > If the file is large enough to be interesting, I think that we might want to think about a scheme that would bring small random IO's more into line with the 1MB results Eric saw. > > One way to do that might be to have a minimum "chunk" that we would zero out for any IO to an allocated but unwritten extent. You write 4KB to the middle of said region, we pad up and zero out to the nearest MB with zeros. There is already code for this in the ext4 uninit extent handling. Currently the limit is 15(?) blocks, to inflate the write size up to zero-fill a 64kB chunk on 4kB blocksize filesystems. I wouldn't object to increasing this to zero out a full 1MB (aligned) chunk under random IO cases. Yes, it would hurt the latency a small amount for the first writes (though not very much for disks, given the seek overhead), but it would avoid clobbering the extent tree so drastically, which also has long-term bad performance effects. > Note for the target class of drives (S-ATA) that Ted mentioned earlier, doing a random 4KB write vs a 1MB write is not that much slower (you need to pay the head movement costs already). Of course, the sweet spot might turn out to be a bit smaller or larger. Right, at ~100-150 seeks/sec, it is pretty close to the 150 MB/s write bandwidth limit, so the cost of doing 4kB vs. 1MB writes is a toss-up. I expect the bandwidth of drives to continue increasing, while the seek rate will stay the same. 
The tradeoff for SSD devices is not so clear, since inflated writes will hurt the cache, but in that case, it makes more sense to use trim-with-zero (if supported) instead of unallocated blocks anyway. Cheers, Andreas -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Jun 29, 2012 at 01:02:35PM -0600, Andreas Dilger wrote: > There is already code for this in the ext4 uninit extent handling. > Currently the limit is 15(?) blocks, to inflate the write size up > to zero-fill a 64kB chunk on 4kB blocksize filesystems. I wouldn't > object to increasing this to zero out a full 1MB (aligned) chunk > under random IO cases. I agree with you that we can increase it to 1MB chunk, and I will run some tests to see the result. In addition, I am thinking whether we can provide a parameter in sysfs or some other places to adjust this value dynamically. Regards, Zheng -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Jun 27, 2012 at 07:02:45PM -0400, Eric Sandeen wrote: > On 6/27/12 3:30 PM, Theodore Ts'o wrote: > > A better workload would be to use a blocksize of 4k. By using a > > blocksize of 1024k, it's not surprising that the metadata overhead is > > in the noise. > > > > Try something like this; this will cause the extent tree overhead to > > be roughly equal to the data block I/O. > > > > [global] > > rw=randwrite > > size=128m > > filesize=1g > > bs=4k > > ioengine=sync > > fallocate=1 > > fsync=1 > > > > [thread1] > > filename=testfile Hi Eric, Could you please run this test with 'journal_async_commit' flag. In my previous test, this feature can dramatically improve the performance of uninitialized extent conversion. I have sent an email to do a similar test [1]. In that email, I do a similar test and the journal_async_commit flag can improve the performance quite a bit. 1. http://comments.gmane.org/gmane.linux.file-systems/63569 Regards, Zheng -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 07/01/2012 10:16 PM, Zheng Liu wrote: > On Wed, Jun 27, 2012 at 07:02:45PM -0400, Eric Sandeen wrote: >> On 6/27/12 3:30 PM, Theodore Ts'o wrote: >>> A better workload would be to use a blocksize of 4k. By using a >>> blocksize of 1024k, it's not surprising that the metadata overhead is >>> in the noise. >>> >>> Try something like this; this will cause the extent tree overhead to >>> be roughly equal to the data block I/O. >>> >>> [global] >>> rw=randwrite >>> size=128m >>> filesize=1g >>> bs=4k >>> ioengine=sync >>> fallocate=1 >>> fsync=1 >>> >>> [thread1] >>> filename=testfile > > Hi Eric, > > Could you please run this test with 'journal_async_commit' flag. In my > previous test, this feature can dramatically improve the performance of > uninitialized extent conversion. > > I have sent an email to do a similar test [1]. In that email, I do a > similar test and the journal_async_commit flag quite can improve the > performance. I can try to find time for that, but so far I haven't actually seen any severe impact of conversion on a non-debug kernel. And didn't Jan think that journal_async_commit was fatally flawed? At this point I think it is up to the proponents of the expose-stale-data patch to document & demonstrate the workload which justifies such a change... -Eric > 1. http://comments.gmane.org/gmane.linux.file-systems/63569 > > Regards, > Zheng -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon 02-07-12 11:33:33, Eric Sandeen wrote: > On 07/01/2012 10:16 PM, Zheng Liu wrote: > > Hi Eric, > > > > Could you please run this test with 'journal_async_commit' flag. In my > > previous test, this feature can dramatically improve the performance of > > uninitialized extent conversion. > > > > I have sent an email to do a similar test [1]. In that email, I do a > > similar test and the journal_async_commit flag quite can improve the > > performance. > > I can try to find time for that, but so far I haven't actually seen any > severe impact of conversion on a non-debug kernel. And didn't Jan think > that journal_async_commit was fatally flawed? Yes, that option is broken and basically unfixable for data=ordered mode (see http://comments.gmane.org/gmane.comp.file-systems.ext4/30727). For data=writeback it works fine AFAICT. Honza
On 07/02/2012 01:44 PM, Jan Kara wrote: > On Mon 02-07-12 11:33:33, Eric Sandeen wrote: >> On 07/01/2012 10:16 PM, Zheng Liu wrote: >>> Hi Eric, >>> >>> Could you please run this test with 'journal_async_commit' flag. In my >>> previous test, this feature can dramatically improve the performance of >>> uninitialized extent conversion. >>> >>> I have sent an email to do a similar test [1]. In that email, I do a >>> similar test and the journal_async_commit flag quite can improve the >>> performance. >> I can try to find time for that, but so far I haven't actually seen any >> severe impact of conversion on a non-debug kernel. And didn't Jan think >> that journal_async_commit was fatally flawed? > Yes, that option is broken and basically unfixable for data=ordered mode > (see http://comments.gmane.org/gmane.comp.file-systems.ext4/30727). For > data=writeback it works fine AFAICT. > > Honza I think that we need to start with the basics - let's find a specific work load that we can use to validate the performance. In Eric's testing, nothing really jumped out. What type of disk, the specific worst case IO size, etc would be great (and even better, if someone has a real world, non-synthetic app that shows this). Definitely more interesting I think to try and do the MB size extent conversion, that should be generally a good technique to minimize the overhead. thanks! Ric -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Jul 02, 2012 at 07:44:21PM +0200, Jan Kara wrote: > Yes, that option is broken and basically unfixable for data=ordered mode > (see http://comments.gmane.org/gmane.comp.file-systems.ext4/30727). For > data=writeback it works fine AFAICT. The journal_async_commit option can be saved, but it requires changing how we handle stale data. What we need to do is to change things so that we update the metadata *after* the data has been written back. We do this already if the experimental dioread_nolock code is used, but currently it only works for 4k blocks. The I/O tree work will give us the infrastructure we need so we can easily update the metadata after the data blocks have been written out when we are extending or filling in a sparse hole, even when the block size != page size. (This is why we can't currently make the dioread_nolock code path the default; it would cause things to break on 1k/2k file systems, as well as 4k file systems on Power.) But once this is done, it will allow us to subsume and rip out the dioread_nolock codepath, and the distinction between ordered and writeback mode. Also, the metadata checksum patches will fix the other potential problem with using journal_async_commit, which is that it adds fine-grained checksums in the journal, so we can recover more easily from a corrupted journal. So once all of this is stable, we'll be able to significantly simplify the ext4 code and our testing matrix, and get all of the benefits of data=writeback, dioread_nolock, and journal_async_commit, without any of their current drawbacks. Which is why I've kept on pestering Zheng about how the I/O tree work has been coming along on the ext4 calls; it's going to enable some really cool things. :-) - Ted -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon 02-07-12 14:01:50, Ted Tso wrote: > On Mon, Jul 02, 2012 at 07:44:21PM +0200, Jan Kara wrote: > > Yes, that option is broken and basically unfixable for data=ordered mode > > (see http://comments.gmane.org/gmane.comp.file-systems.ext4/30727). For > > data=writeback it works fine AFAICT. > > The journal_async_commit option can be saved, but it requires changing > how we handle stale data. What we need to do is to change things so > that we update the metadata *after* the data has been written back. > We do this already if the experimental dioread_nolock code is used, > but currently it only works for 4k blocks. This won't save us. The problem with async commit is that in the sequence: write data wait for data write completion change metadata write transaction with changed metadata write commit block CACHE_FLUSH if you flip the power switch just before CACHE_FLUSH, the disk could have cached stuff so that the whole transaction made it to permanent storage but the data didn't. So recovery will happily replay the transaction and make unwritten disk blocks accessible. There is no simple way around this problem except for issuing CACHE_FLUSH sometime after data writes have completed and before you write the commit block... And if you ask about the complicated way ;-): You can compute a data checksum, add it to the transaction and check it on replay. > The I/O tree work will give us the infrastructure we need so we can > easily update the metadata after the data blocks have been written out > when we are extending or filling in a sparse hole, even when the block > size != page size. (This is why we can't currently make the > dioread_nolock code path the default; it would cause things to break > on 1k/2k file systems, as well as 4k file systems on Power.) But once > this is done, it will allow us to subsume and rip out dioread_nolock > code[path, and the distinction between ordered and writeback mode. 
> > Also, the metadata checksum patches will fix the other potential > problem with using journal_async_commit, which is that it adds > fine-grained checksums in the journal, so we can recover more easily > from a corrupted journal. > > So once all of this is stable, we'll be able significantly simplify > the ext4 code and our testing matrix, and get all of the benefits of > data=writeback, dioread_nolock, and journal_async_commit, without any > of their current drawbacks. Which is why I've kept on pestering Zheng > about how the I/O tree work has been coming along on the ext4 calls; > it's going to enable some really cool things. :-) Yeah, this would be good stuff. Just it won't be enough for journal_async_commit in data=ordered mode... Honza
-----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1 On 07/02/2012 01:44 PM, Jan Kara wrote: > Yes, that option is broken and basically unfixable for data=ordered mode > (see http://comments.gmane.org/gmane.comp.file-systems.ext4/30727). For > data=writeback it works fine AFAICT. Does data=writeback with barriers fix this fallocate problem? I can see that writing small random bits into a very large uninitialized file would cause a lot of updates to the extent tree, but with a sufficiently large cache, shouldn't many of the small, random writes become coalesced into fewer, larger extent updates that are done at delayed flush time? Are you sure that the extent tree updates are not being done right away and blocking the writing application, instead of being delayed? -----BEGIN PGP SIGNATURE----- Version: GnuPG v1.4.11 (GNU/Linux) Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org/ iQEcBAEBAgAGBQJP85kgAAoJEJrBOlT6nu75pnEIAM1qX1PeoZOiWJumgv9yA0te C61813Vx2cVz5UpRZLO/YpAPTtei1ry2uLdN/TlmiH8TBc/fexipJwZGucYH7b9w iwE4u19eixYCDV8iVJYVZ6NI6dp9McKNxqrvhcrBUCRiG+/q0Qp5dsD0Gu4PAnOy rZzMvDU6Ln12nBH0M+UqE7VVZR6FR89tgpklSCDliPbDc0xWNUZQW0CRUkMVaAXu jFJlNMiMBg7Cu0QzGVj7EzY+GtFGcEEVE5Us2c04bdm6nbvS567Pq6LfwyOM5IVe q+/dwIN0Sdj3nneyPvrIfaFvQAKmtRnd6vkPxUnZgYSniq+hD0uf6J4nU1IKhw8= =i9g7 -----END PGP SIGNATURE----- -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hi Phillip, On Tue, Jul 03, 2012 at 09:15:16PM -0400, Phillip Susi wrote: > On 07/02/2012 01:44 PM, Jan Kara wrote: > > Yes, that option is broken and basically unfixable for data=ordered mode > > (see http://comments.gmane.org/gmane.comp.file-systems.ext4/30727). For > > data=writeback it works fine AFAICT. > > Does data=writeback with barriers fix this fallocate problem? If we only use data=writeback without 'journal_async_commit', it won't fix this problem according to my test. :-( > > I can see that writing small random bits into a very large uninitialized file would cause a lot of updates to the extent tree, but with a sufficiently large cache, shouldn't many of the small, random writes become coalesced into fewer, larger extent updates that are done at delayed flush time? Are you sure that the extent tree updates are not being done right away and blocking the writing application, instead of being delayed? Actually the workload needs to flush the data after writing a small random bits. This workload is met in our product system at Taobao. Thus, the application has to wait this write to be done. Certainly if we don't flush the data, the problem won't happen but there is a risk that we could lose our data. Regards, Zheng -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
-----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1 On 07/03/2012 10:36 PM, Zheng Liu wrote: > Actually the workload needs to flush the data after writting a small > random bits. This workload is met in our product system at Taobao. > Thus, the application has to wait this write to be done. Certainly if > we don't flush the data, the problem won't happen but there is a risk > that we could loss our data. Ohh, I see now... you want lots of small, random, synchronous writes. Then I think the only way to avoid the metadata overhead is with the unsafe stale data patch. More importantly, this workload is going to have terrible performance no matter what the fs does, because even with all of the blocks initialized, you're still doing lots of seeking and not allowing the IO elevator to help. Maybe the application can be redesigned so that it does not generate such pathological IO? Or is this another case of userspace really needing access to barriers rather than using the big hammer of fsync? Is there any technical reason why a barrier flag can't be added to the aio interface? -----BEGIN PGP SIGNATURE----- Version: GnuPG v1.4.11 (GNU/Linux) Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org/ iQEcBAEBAgAGBQJP87NSAAoJEJrBOlT6nu75F7wH/0NrhBZyp+KwYuJpqM+hUs8o 1CSRXpw/OBOljH61kAT1zqRofMNA/6gF5EYnAB3usUiwjJk4eQu4kFCVastZrLRQ Zm3H2yuSXbeoRvuSnNBbpjCPFQYIKWHcKjt2+pov853Kb2auYSyN2T6I7qUmLu7U tMJtJekMi4HPz3snBrLGsvYRyy7rMLWwf3wd+G2SBrVNS/tKuRSRaw0FwXqDaoN9 Dc4FqWmVeicG9qQbszkal84IZo1YwSWMzeRvqjn+LER1XTJJaSzvttppBKKFsS2M FVeMwTtRHJ3wl3jWYTyN/AwDUg3Pr/PX9/xmdn8DSGQG3eo7sBlqyPBzN6QYz5w= =C2YA -----END PGP SIGNATURE----- -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jul 03, 2012 at 11:06:58PM -0400, Phillip Susi wrote: > Ohh, I see now... you want lots of small, random, synchronous writes. Then I think the only way to avoid the metadata overhead is with the unsafe stale data patch. More importantly, this workload is going to have terrible performance no matter what the fs does, because even with all of the blocks initialized, you're still doing lots of seeking and not allowing the IO elevator to help. Maybe the application can be redesigned so that it does not generate such pathological IO? Yes, I agree with you that the application should be re-designed but we cannot control it. So we have to use this big hammer. :-( > > Or is this another case of userspace really needing access to barriers rather than using the big hammer of fsync? Maybe Google meets the same problem. > > Is there any technical reason why a barrier flag can't be added to the aio interface? Sorry, I am not very familiar with it. Regards, Zheng -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 07/03/2012 11:06 PM, Phillip Susi wrote: > -----BEGIN PGP SIGNED MESSAGE----- > Hash: SHA1 > > On 07/03/2012 10:36 PM, Zheng Liu wrote: >> Actually the workload needs to flush the data after writting a small >> random bits. This workload is met in our product system at Taobao. >> Thus, the application has to wait this write to be done. Certainly if >> we don't flush the data, the problem won't happen but there is a risk >> that we could loss our data. > Ohh, I see now... you want lots of small, random, synchronous writes. Then I think the only way to avoid the metadata overhead is with the unsafe stale data patch. More importantly, this workload is going to have terrible performance no matter what the fs does, because even with all of the blocks initialized, you're still doing lots of seeking and not allowing the IO elevator to help. Maybe the application can be redesigned so that it does not generate such pathological IO? > > Or is this another case of userspace really needing access to barriers rather than using the big hammer of fsync? > > Is there any technical reason why a barrier flag can't be added to the aio interface? For reasonably sized files, you might just as well really write out the full file with "write" (do pre-allocation the old fashioned way). Performance of course depends on the size of the file, but for a 1GB file you can do this in a few seconds and prevent fragmentation and totally eliminate the performance of flipping extents. How large is the file you need to pre-allocate? How long does the job typically run (minutes? hours? days?) :) ric -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Jul 04, 2012 at 08:20:07AM -0400, Ric Wheeler wrote: > On 07/03/2012 11:06 PM, Phillip Susi wrote: > For reasonably sized files, you might just as well really write out > the full file with "write" (do pre-allocation the old fashioned > way). > > Performance of course depends on the size of the file, but for a 1GB > file you can do this in a few seconds and prevent fragmentation and > totally eliminate the performance of flipping extents. > > How large is the file you need to pre-allocate? How long does the > job typically run (minutes? hours? days?) :) We have over one thousand servers with ten or more 2T SATA disks, and we need to pre-allocate a lot of files with fixed size until the space of every disk is occupied when the application starts up for the first time. Meanwhile this number is increasing every month. It will be absolutely a nightmare for SA if we use the old fashioned way to do pre-allocation. Regards, Zheng -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index aaaece6..ac7aa42 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1240,6 +1240,9 @@ struct ext4_sb_info { unsigned long s_mb_last_group; unsigned long s_mb_last_start; + /* gid that's allowed to see stale data via falloc flag. */ + gid_t no_hide_stale_gid; + /* stats for buddy allocator */ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ atomic_t s_bal_success; /* we found long enough chunks */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index cb99346..cc57c85 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4375,6 +4375,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) int retries = 0; int flags; struct ext4_map_blocks map; + struct ext4_sb_info *sbi; unsigned int credits, blkbits = inode->i_blkbits; /* @@ -4385,12 +4386,28 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) return -EOPNOTSUPP; /* Return error if mode is not supported */ - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_NO_HIDE_STALE)) + return -EOPNOTSUPP; + + /* The combination of NO_HIDE_STALE and KEEP_SIZE is not supported */ + if ((mode & FALLOC_FL_NO_HIDE_STALE) && + (mode & FALLOC_FL_KEEP_SIZE)) return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) return ext4_punch_hole(file, offset, len); + sbi = EXT4_SB(inode->i_sb); + /* Must have RAWIO to see stale data. 
*/ + if ((mode & FALLOC_FL_NO_HIDE_STALE) && + !in_egroup_p(sbi->no_hide_stale_gid)) + return -EACCES; + + /* preallocation to directories is currently not supported */ + if (S_ISDIR(inode->i_mode)) + return -ENODEV; + trace_ext4_fallocate_enter(inode, offset, len, mode); map.m_lblk = offset >> blkbits; /* @@ -4429,6 +4446,8 @@ retry: ret = PTR_ERR(handle); break; } + if (mode & FALLOC_FL_NO_HIDE_STALE) + flags &= ~EXT4_GET_BLOCKS_UNINIT_EXT; ret = ext4_map_blocks(handle, inode, &map, flags); if (ret <= 0) { #ifdef EXT4FS_DEBUG diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5b443a8..d976ec1 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1175,6 +1175,8 @@ static int ext4_show_options(struct seq_file *seq, struct dentry *root) if (test_opt2(sb, BIG_EXT)) seq_puts(seq, ",big_extent"); #endif + if (sbi->no_hide_stale_gid != -1) + seq_printf(seq, ",nohide_stale_gid=%u", sbi->no_hide_stale_gid); ext4_show_quota_options(seq, sb); @@ -1353,6 +1355,7 @@ enum { #ifdef CONFIG_EXT4_BIG_EXTENT Opt_big_extent, Opt_nobig_extent, #endif + Opt_nohide_stale_gid, }; static const match_table_t tokens = { @@ -1432,6 +1435,7 @@ static const match_table_t tokens = { {Opt_big_extent, "big_extent"}, {Opt_nobig_extent, "nobig_extent"}, #endif + {Opt_nohide_stale_gid, "nohide_stale_gid=%u"}, {Opt_err, NULL}, }; @@ -1931,6 +1935,12 @@ set_qf_format: return 0; sbi->s_li_wait_mult = option; break; + case Opt_nohide_stale_gid: + if (match_int(&args[0], &option)) + return 0; + /* -1 for disabled, otherwise it's valid. */ + sbi->no_hide_stale_gid = option; + break; case Opt_noinit_itable: clear_opt(sb, INIT_INODE_TABLE); break; @@ -3274,6 +3284,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) #ifdef CONFIG_EXT4_BIG_EXTENT sbi->s_min_big_ext_size = EXT4_DEFAULT_MIN_BIG_EXT_SIZE; #endif + /* Default to having no-hide-stale disabled. 
*/ + sbi->no_hide_stale_gid = -1; if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) set_opt(sb, BARRIER); diff --git a/fs/open.c b/fs/open.c index 201431a..4edc0cd 100644 --- a/fs/open.c +++ b/fs/open.c @@ -224,7 +224,9 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) return -EINVAL; /* Return error if mode is not supported */ - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + if (mode & ~(FALLOC_FL_KEEP_SIZE | + FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_NO_HIDE_STALE)) return -EOPNOTSUPP; /* Punch hole must have keep size set */ diff --git a/include/linux/falloc.h b/include/linux/falloc.h index 73e0b62..a2489ac 100644 --- a/include/linux/falloc.h +++ b/include/linux/falloc.h @@ -3,6 +3,7 @@ #define FALLOC_FL_KEEP_SIZE 0x01 /* default is extend size */ #define FALLOC_FL_PUNCH_HOLE 0x02 /* de-allocates range */ +#define FALLOC_FL_NO_HIDE_STALE 0x04 /* default is hide stale data */ #ifdef __KERNEL__