Patchwork ext4: add noorlov parameter to avoid spreading of directory inodes

login
register
mail settings
Submitter Benjamin LaHaise
Date Oct. 1, 2013, 4:08 p.m.
Message ID <20131001160817.GA2295@kvack.org>
Download mbox | patch
Permalink /patch/279497/
State New
Headers show

Comments

Benjamin LaHaise - Oct. 1, 2013, 4:08 p.m.
While investigating a performance regression during migration of the
Solace product from an older kernel running ext3 to a 3.x kernel running
ext4, the change in allocation policies between ext3 and ext4 was found
to have caused a 10-50% decrease (depending on the test) in I/O
throughput.  In order to extract more parallelism from the filesystem,
this particular use-case has 100 subdirectories off of the root
directory of an ext4 filesystem in which files are created in a
round-robin fashion.  The subdirectories are used in order to increase
the number of metadata operations that can occur in parallel.  With the
older setup on ext3, files were created sequentially, while using ext4
resulted in the files being spread out across block groups.

To avoid this change in allocation policies, introduce the noorlov mount
parameter to ext4.  This parameter changes allocation policy such that new
subdirectories in the filesystem are allocated in the same block group
as the parent subdirectory.  With the subdirectories in the same block
group, the allocation policy once again results in files being laid out
sequentially on disk, restoring performance.

Signed-off-by: Benjamin LaHaise <ben.lahaise@solacesystems.com>
Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara - Oct. 2, 2013, 2:47 p.m.
On Tue 01-10-13 12:08:17, Benjamin LaHaise wrote:
> While investigating a performance regression during migration of the
> Solace product from an older kernel running ext3 to a 3.x kernel running
> ext4, the change in allocation policies between ext3 and ext4 were found
> to have caused a 10-50% decrease (depending on the test) in I/O
> throughput.  In order to extract more parallelism from the filesystem,
> this particular use-case has 100 subdirectories off of the root
> directory of an ext4 filesystem in which files are created in a
> round-robin fashion.  The subdirectories are used in order to increase
> the number of metadata operations that can occur in parallel.  With the
> older setup on ext3, files were created sequentially, while using ext4
> resulted in the files being spread out across block groups.
> 
> To avoid this change in allocation policies, introduce the noorlov mount
> parameter to ext4.  This parameter changes allocation policy such that new
> subdirectories in the filesystem are allocated in the same block group
> as the parent subdirectory.  With the subdirectories in the same block
> group, the allocation policy once again results in files being laid out
> sequentially on disk, restoring performance.
  Frankly, I'm not very fond of a mount option for tweaking inode allocation
policy. OTOH the regression is large enough that we should address it
somehow.

So I suppose if your application doesn't use the root directory as a base
but some other directory on ext4 filesystem, everything is OK, isn't it?
Because the root directory is special in the Orlov allocator and that is
where the randomness happens.

If I'm right about the source of the problem, we could use TOPDIR inode
flag to handle this. Currently Orlov allocator treats directories with
TOPDIR flag set the same way as the root directory. Sadly the root
directory itself is hardcoded in the allocator but we could remove that
and just keep the check for TOPDIR flag. To handle backward compatibility,
we would set TOPDIR for root inode during mount the first time we mount the fs
with the new kernel (needs some flag in the superblock).

Hum, so when I wrote this I'm not sure this is that much better than a
mount option. But it's a possibility :). What do others think?

								Honza

> 
> Signed-off-by: Benjamin LaHaise <ben.lahaise@solacesystems.com>
> Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index af815ea..3894ab0 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -985,6 +985,8 @@ struct ext4_inode_info {
>  #define EXT4_MOUNT2_STD_GROUP_SIZE	0x00000002 /* We have standard group
>  						      size of blocksize * 8
>  						      blocks */
> +#define EXT4_MOUNT2_NO_ORLOV		0x00000004 /* Disable orlov for inode
> +						      allocation */
>  
>  #define clear_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt &= \
>  						~EXT4_MOUNT_##opt
> diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
> index 137193f..2b1b4ee 100644
> --- a/fs/ext4/ialloc.c
> +++ b/fs/ext4/ialloc.c
> @@ -745,7 +745,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
>  		goto got_group;
>  	}
>  
> -	if (S_ISDIR(mode))
> +	if (!test_opt2(sb, NO_ORLOV) && S_ISDIR(mode))
>  		ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
>  	else
>  		ret2 = find_group_other(sb, dir, &group, mode);
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 2c2e6cb..d0bdcd7 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -1143,7 +1143,7 @@ enum {
>  	Opt_inode_readahead_blks, Opt_journal_ioprio,
>  	Opt_dioread_nolock, Opt_dioread_lock,
>  	Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
> -	Opt_max_dir_size_kb,
> +	Opt_max_dir_size_kb, Opt_noorlov,
>  };
>  
>  static const match_table_t tokens = {
> @@ -1163,6 +1163,7 @@ static const match_table_t tokens = {
>  	{Opt_debug, "debug"},
>  	{Opt_removed, "oldalloc"},
>  	{Opt_removed, "orlov"},
> +	{Opt_noorlov, "noorlov"},
>  	{Opt_user_xattr, "user_xattr"},
>  	{Opt_nouser_xattr, "nouser_xattr"},
>  	{Opt_acl, "acl"},
> @@ -1341,6 +1342,7 @@ static const struct mount_opts {
>  	int	token;
>  	int	mount_opt;
>  	int	flags;
> +	int	mount_opt2;
>  } ext4_mount_opts[] = {
>  	{Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET},
>  	{Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR},
> @@ -1417,6 +1419,7 @@ static const struct mount_opts {
>  	{Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
>  	{Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
>  	{Opt_max_dir_size_kb, 0, MOPT_GTE0},
> +	{Opt_noorlov, 0, MOPT_SET, EXT4_MOUNT2_NO_ORLOV},
>  	{Opt_err, 0, 0}
>  };
>  
> @@ -1601,6 +1604,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
>  		} else {
>  			clear_opt(sb, DATA_FLAGS);
>  			sbi->s_mount_opt |= m->mount_opt;
> +			sbi->s_mount_opt2 |= m->mount_opt2;
>  		}
>  #ifdef CONFIG_QUOTA
>  	} else if (m->flags & MOPT_QFMT) {
> @@ -1630,10 +1634,13 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
>  			WARN_ON(1);
>  			return -1;
>  		}
> -		if (arg != 0)
> +		if (arg != 0) {
>  			sbi->s_mount_opt |= m->mount_opt;
> -		else
> +			sbi->s_mount_opt2 |= m->mount_opt2;
> +		} else {
>  			sbi->s_mount_opt &= ~m->mount_opt;
> +			sbi->s_mount_opt2 &= ~m->mount_opt2;
> +		}
>  	}
>  	return 1;
>  }
> @@ -1777,11 +1784,15 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
>  		if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
>  		    (m->flags & MOPT_CLEAR_ERR))
>  			continue;
> -		if (!(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
> +		if (!(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)) &&
> +		    !(m->mount_opt2 & sbi->s_mount_opt2))
>  			continue; /* skip if same as the default */
> -		if ((want_set &&
> -		     (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) ||
> -		    (!want_set && (sbi->s_mount_opt & m->mount_opt)))
> +		if (want_set &&
> +		    (((sbi->s_mount_opt & m->mount_opt) != m->mount_opt) ||
> +		     ((sbi->s_mount_opt2 & m->mount_opt2) != m->mount_opt2)))
> +			continue; /* select Opt_noFoo vs Opt_Foo */
> +		if (!want_set && ((sbi->s_mount_opt & m->mount_opt) ||
> +				  (sbi->s_mount_opt2 & m->mount_opt2)))
>  			continue; /* select Opt_noFoo vs Opt_Foo */
>  		SEQ_OPTS_PRINT("%s", token2str(m->token));
>  	}
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Sandeen - Oct. 2, 2013, 3:02 p.m.
On 10/2/13 9:47 AM, Jan Kara wrote:
> On Tue 01-10-13 12:08:17, Benjamin LaHaise wrote:
>> While investigating a performance regression during migration of the
>> Solace product from an older kernel running ext3 to a 3.x kernel running
>> ext4, the change in allocation policies between ext3 and ext4 were found
>> to have caused a 10-50% decrease (depending on the test) in I/O
>> throughput.  In order to extract more parallelism from the filesystem,
>> this particular use-case has 100 subdirectories off of the root
>> directory of an ext4 filesystem in which files are created in a
>> round-robin fashion.  The subdirectories are used in order to increase
>> the number of metadata operations that can occur in parallel.  With the
>> older setup on ext3, files were created sequentially, while using ext4
>> resulted in the files being spread out across block groups.
>>
>> To avoid this change in allocation policies, introduce the noorlov mount
>> parameter to ext4.  This parameter changes allocation policy such that new
>> subdirectories in the filesystem are allocated in the same block group
>> as the parent subdirectory.  With the subdirectories in the same block
>> group, the allocation policy once again results in files being laid out
>> sequentially on disk, restoring performance.
>   Frankly, I'm not very fond of a mount option for tweaking inode allocation
> policy. OTOH the regression is large enough that we should address it
> somehow.
> 
> So I suppose if your application doesn't use the root directory as a base
> but some other directory on ext4 filesystem, everything is OK, isn't it?
> Because the root directory is special in the Orlov allocator and that is
> where the randomness happens.
> 
> If I'm right about the source of the problem, we could use TOPDIR inode
> flag to handle this. Currently Orlov allocator treats directories with
> TOPDIR flag set the same way as the root directory. Sadly the root
> directory itself is hardcoded in the allocator but we could remove that
> just keep the check for TOPDIR flag. To handle backward compatibility,
> we would set TOPDIR for root inode during mount first time we mount the fs
> with the new kernel (needs some flag in the superblock).
> 
> Hum, so when I wrote this I'm not sure this is that much better than a
> mount option. But it's a possibility :). What do others think?

I'm right with you on thinking a mount option should be a last resort.

One thing I'm curious about - what changed from ext3 to ext4?  I thought
both defaulted to orlov and the same type of allocation behavior, more
or less.  I guess one change is that the "oldalloc" mount
option went away.

(if it does come back, it should probably mirror what we had before,
which was "oldalloc" not "noorlov" right?)

-Eric

> 								Honza
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lukas Czerner - Oct. 2, 2013, 3:25 p.m.
On Wed, 2 Oct 2013, Eric Sandeen wrote:

> Date: Wed, 02 Oct 2013 10:02:12 -0500
> From: Eric Sandeen <sandeen@redhat.com>
> To: Jan Kara <jack@suse.cz>
> Cc: Benjamin LaHaise <bcrl@kvack.org>, Theodore Ts'o <tytso@mit.edu>,
>     Andreas Dilger <adilger.kernel@dilger.ca>, linux-ext4@vger.kernel.org
> Subject: Re: [PATCH] ext4: add noorlov parameter to avoid spreading of
>     directory inodes
> 
> On 10/2/13 9:47 AM, Jan Kara wrote:
> > On Tue 01-10-13 12:08:17, Benjamin LaHaise wrote:
> >> While investigating a performance regression during migration of the
> >> Solace product from an older kernel running ext3 to a 3.x kernel running
> >> ext4, the change in allocation policies between ext3 and ext4 were found
> >> to have caused a 10-50% decrease (depending on the test) in I/O
> >> throughput.  In order to extract more parallelism from the filesystem,
> >> this particular use-case has 100 subdirectories off of the root
> >> directory of an ext4 filesystem in which files are created in a
> >> round-robin fashion.  The subdirectories are used in order to increase
> >> the number of metadata operations that can occur in parallel.  With the
> >> older setup on ext3, files were created sequentially, while using ext4
> >> resulted in the files being spread out across block groups.
> >>
> >> To avoid this change in allocation policies, introduce the noorlov mount
> >> parameter to ext4.  This parameter changes allocation policy such that new
> >> subdirectories in the filesystem are allocated in the same block group
> >> as the parent subdirectory.  With the subdirectories in the same block
> >> group, the allocation policy once again results in files being laid out
> >> sequentially on disk, restoring performance.
> >   Frankly, I'm not very fond of a mount option for tweaking inode allocation
> > policy. OTOH the regression is large enough that we should address it
> > somehow.
> > 
> > So I suppose if your application doesn't use the root directory as a base
> > but some other directory on ext4 filesystem, everything is OK, isn't it?
> > Because the root directory is special in the Orlov allocator and that is
> > where the randomness happens.
> > 
> > If I'm right about the source of the problem, we could use TOPDIR inode
> > flag to handle this. Currently Orlov allocator treats directories with
> > TOPDIR flag set the same way as the root directory. Sadly the root
> > directory itself is hardcoded in the allocator but we could remove that
> > just keep the check for TOPDIR flag. To handle backward compatibility,
> > we would set TOPDIR for root inode during mount first time we mount the fs
> > with the new kernel (needs some flag in the superblock).
> > 
> > Hum, so when I wrote this I'm not sure this is that much better than a
> > mount option. But it's a possibility :). What do others think?
> 
> I'm right with you on thinking a mount option should be a last resort.
> 
> One thing I'm curious about - what changed from ext3 to ext4?  I thought
> both defaulted to orlov and the same type of allocation behavior, more
> or less.  I guess one change is that the "oldalloc" mount
> option went away.
> 
> (if it does come back, it should probably mirror what we had before,
> which was "oldalloc" not "noorlov" right?)

Well, I guess I am to blame; see commit
4113c4caa4f355b8ff8b7ff0510c29c9d00d30b3 which removed the old
oldalloc mount option as a part of the effort to actually reduce the
number of mount options :)

So we can either bring back the oldalloc option or use the TOPDIR inode
flag as Jan suggested. In this case having the inode flag seems like
a better option to me.

Thanks!
-Lukas


> 
> -Eric
> 
> > 								Honza
> > 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin LaHaise - Oct. 2, 2013, 3:31 p.m.
On Wed, Oct 02, 2013 at 10:02:12AM -0500, Eric Sandeen wrote:
> I'm right with you on thinking a mount option should be a last resort.
> 
> One thing I'm curious about - what changed from ext3 to ext4?  I thought
> both defaulted to orlov and the same type of allocation behavior, more
> or less.  I guess one change is that the "oldalloc" mount
> option went away.

> (if it does come back, it should probably mirror what we had before,
> which was "oldalloc" not "noorlov" right?)

The behaviour I'm looking for is not exactly the same as the orlov
allocator or the old allocator, but something that packs files as
closely together as possible.  Half of this can be achieved with
fallocate(), but reducing the spreading of directory inodes can only be
accomplished with changes to the filesystem itself.  The only reason
we're using multiple subdirectories is because of contention issues with
i_mutex (our application has to either fsync() the directory or mount
with dirsync to maintain consistency) during file creation and unlink().

		-ben
Jan Kara - Oct. 2, 2013, 3:57 p.m.
On Wed 02-10-13 11:31:01, Benjamin LaHaise wrote:
> On Wed, Oct 02, 2013 at 10:02:12AM -0500, Eric Sandeen wrote:
> > I'm right with you on thinking a mount option should be a last resort.
> > 
> > One thing I'm curious about - what changed from ext3 to ext4?  I thought
> > both defaulted to orlov and the same type of allocation behavior, more
> > or less.  I guess one change is that the "oldalloc" mount
> > option went away.
> 
> > (if it does come back, it should probably mirror what we had before,
> > which was "oldalloc" not "noorlov" right?)
> 
> The behaviour I'm looking for is not exactly the same as the orlov
> allocator or the old allocator, but something that packs files as
> closely together as possible.  Half of this can be achieved with
> fallocate(), but reducing the spreading of directory inodes can only be
> accomplished with changes to the filesystem itself.
  Yes, but if we disable orlov allocation by clearing TOPDIR flag, we will
allocate inodes sequentially from the group which is what you want.

> The only reason we're using multiple subdirectories is because of
> contention issues with i_mutex (our application has to either fsync() the
> directory or mount with dirsync to maintain consistency) during file
> creation and unlink().
  So i_mutex isn't held during fsync in ext4 in recent kernels. So that
won't be a source of contention anymore. But other directory operations
will be so I guess splitting files among lots of directories still makes
sense.

								Honza
Theodore Ts'o - Oct. 2, 2013, 4:23 p.m.
On Wed, Oct 02, 2013 at 10:02:12AM -0500, Eric Sandeen wrote:
> One thing I'm curious about - what changed from ext3 to ext4?  I thought
> both defaulted to orlov and the same type of allocation behavior, more
> or less.  I guess one change is that the "oldalloc" mount
> option went away.

Ext3 used an orlov style allocator as well.  The main difference
between ext4 and ext3 is the orlov allocator is now done on a
per-flexbg basis instead of per-blockgroup basis.

That is, we do the statistics based on a flex-bg basis instead of the
blockgroup basis.  As a result, I suspect Ben would see the inode
allocation behavior equivalent to ext3 if he creates the file system
using "mke2fs -t ext4 -G 1" to force the flex_bg size to 1.

Can you let me know what the size of the file system was, and mke2fs
parameters you were using for ext3 and ext4?  I have a feeling that
inode allocations weren't optimal for your use case even with ext3,
but because we now spread the inodes based on flex_bg's instead of
block groups, that's why you saw the performance degradation.

      	      	     	     	     - Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lukas Czerner - Oct. 2, 2013, 4:44 p.m.
On Wed, 2 Oct 2013, Benjamin LaHaise wrote:

> Date: Wed, 2 Oct 2013 11:31:01 -0400
> From: Benjamin LaHaise <bcrl@kvack.org>
> To: Eric Sandeen <sandeen@redhat.com>
> Cc: Jan Kara <jack@suse.cz>, Theodore Ts'o <tytso@mit.edu>,
>     Andreas Dilger <adilger.kernel@dilger.ca>, linux-ext4@vger.kernel.org
> Subject: Re: [PATCH] ext4: add noorlov parameter to avoid spreading of
>     directory inodes
> 
> On Wed, Oct 02, 2013 at 10:02:12AM -0500, Eric Sandeen wrote:
> > I'm right with you on thinking a mount option should be a last resort.
> > 
> > One thing I'm curious about - what changed from ext3 to ext4?  I thought
> > both defaulted to orlov and the same type of allocation behavior, more
> > or less.  I guess one change is that the "oldalloc" mount
> > option went away.
> 
> > (if it does come back, it should probably mirror what we had before,
> > which was "oldalloc" not "noorlov" right?)
> 
> The behaviour I'm looking for is not exactly the same as the orlov
> allocator or the old allocator, but something that packs files as
> closely together as possible.  Half of this can be achieved with
> fallocate(), but reducing the spreading of directory inodes can only be
> accomplished with changes to the filesystem itself.  The only reason
> we're using multiple subdirectories is because of contention issues with
> i_mutex (our application has to either fsync() the directory or mount
> with dirsync to maintain consistency) during file creation and unlink().

What is the frequency of unlink operation in comparison to file
creation ? There is a possible issue with the global goal cursors
s_mb_last_group and s_mb_last_start which might make your files
increasingly scattered across the disk. I've attempted to address
this problem with my patch

ext4: Try to better reuse recently freed space

What is the usual size of the files this application is creating ?

Thanks!
-Lukas

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin LaHaise - Oct. 2, 2013, 4:52 p.m.
On Wed, Oct 02, 2013 at 06:44:56PM +0200, Lukáš Czerner wrote:
> What is the frequency of unlink operation in comparison to file
> creation ? There is a possible issue with the global goal cursors
> s_mb_last_group and s_mb_last_start which might make your files
> increasingly scattered across the disk. I've attempted to address
> this problem with my patch

unlink() is 1:1 with creation.  The storage on disk is essentially used 
as an elastic buffer for the rest of the system.  That said, unlink() may 
not occur for minutes, hours or days.

In terms of actual allocation pattern on disk, using fallocate() is 
sufficient to ensure that files on disk are usually limited to 1 extent, 
and allocated immediately following each other.  Without fallocate(), I 
was seeing allocations aligned to 2048 block boundaries for 9MB files, 
which hurt performance quite a bit.

> ext4: Try to better reuse recently freed space
> 
> What is the usual size of the files this application is creating ?

It varies.  The target is 8MB, but it ranges from 5MB to 22MB.  In the 
worst case it can be as small as 4KB.

		-ben

> Thanks!
> -Lukas
Benjamin LaHaise - Oct. 2, 2013, 5:02 p.m.
On Wed, Oct 02, 2013 at 12:23:23PM -0400, Theodore Ts'o wrote:
> Ext3 used an orlov style allocator as well.  The main difference
> between ext4 and ext3 is the orlov allocator is now done on a
> per-flexbg basis instead of per-blockgroup basis.
> 
> That is, we do the statistics based on a flex-bg basis instead of the
> blockgroup basis.  As a result, I suspect Ben would see the inode
> allocation behavior equivalent to ext3 if he creates the file system
> using "mke2fs -t ext4 -G 1" to force the flex_bg size to 1.
> 
> Can you let me know what the size of the file system was, and mke2fs
> parameters you were using for ext3 and ext4?  I have a feeling that
> inode allocations weren't optimal for your use case even with ext3,
> but because we now spread the inodes based on flex_bg's instead of
> block groups, that's why you saw the performance degredation.

This may have been a bit misleading -- other parts of the system changed 
between the version running on ext3 vs ext4.  Subdirectories weren't used 
as much on ext3 as on ext4, so the effect wasn't nearly as pronounced.  
It was further investigation that showed that the spreading of inodes 
for directories was resulting in the files being laid out in different 
block groups, which made the operation of reading/writing files to disk 
much less sequential.

The other big change in allocation between ext3 and ext4 is mballoc.  
Without fallocate() on the files, the allocator in ext4 was preferentially 
aligning files to power-of-2 block numbers.  This led to one of our 
tests where ~9MB files were used to have gaps of ~1800 blocks between 
files (even in the same directory), which degraded transfer rates to/from 
disk thanks to the extra seeks.  But this aspect of tweaking the allocator 
was easily fixed by doing an fallocate() for the size of the file before 
writing to it.

		-ben
Lukas Czerner - Oct. 2, 2013, 5:09 p.m.
On Wed, 2 Oct 2013, Benjamin LaHaise wrote:

> Date: Wed, 2 Oct 2013 12:52:10 -0400
> From: Benjamin LaHaise <bcrl@kvack.org>
> To: Lukáš Czerner <lczerner@redhat.com>
> Cc: Eric Sandeen <sandeen@redhat.com>, Jan Kara <jack@suse.cz>,
>     Theodore Ts'o <tytso@mit.edu>, Andreas Dilger <adilger.kernel@dilger.ca>,
>     linux-ext4@vger.kernel.org
> Subject: Re: [PATCH] ext4: add noorlov parameter to avoid spreading of
>     directory inodes
> 
> On Wed, Oct 02, 2013 at 06:44:56PM +0200, Lukáš Czerner wrote:
> > What is the frequency of unlink operation in comparison to file
> > creation ? There is a possible issue with the global goal cursors
> > s_mb_last_group and s_mb_last_start which might make your files
> > increasingly scattered across the disk. I've attempted to address
> > this problem with my patch
> 
> unlink() is 1:1 with creation.  The storage on disk is essentially used 
> as an elastic buffer for the rest of the system.  That said, unlink() may 
> not occur for minutes, hours or days.
> 
> In terms of actual allocation pattern on disk, using fallocate() is 
> sufficient to ensure that files on disk are usually limited to 1 extent, 
> and allocated immediately following each other.  Without fallocate(), I 
> was seeing allocations aligned to 2048 block boundaries for 9MB files, 
> which hurt performance quite a bit.

I was not talking about internal file fragmentation but the actual
files lying further from the ideal block group they should have been
laid out in.

> 
> > ext4: Try to better reuse recently freed space
> > 
> > What is the usual size of the files this application is creating ?
> 
> It varies.  The target is 8MB, but it ranges from 5MB to 22MB.  In the 
> worst case it can be as small as 4KB.

It means that you're indeed using stream allocation so this issue
would affect you.

> 
> 		-ben
> 
> > Thanks!
> > -Lukas
> 
>

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index af815ea..3894ab0 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -985,6 +985,8 @@  struct ext4_inode_info {
 #define EXT4_MOUNT2_STD_GROUP_SIZE	0x00000002 /* We have standard group
 						      size of blocksize * 8
 						      blocks */
+#define EXT4_MOUNT2_NO_ORLOV		0x00000004 /* Disable orlov for inode
+						      allocation */
 
 #define clear_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt &= \
 						~EXT4_MOUNT_##opt
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 137193f..2b1b4ee 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -745,7 +745,7 @@  struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 		goto got_group;
 	}
 
-	if (S_ISDIR(mode))
+	if (!test_opt2(sb, NO_ORLOV) && S_ISDIR(mode))
 		ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
 	else
 		ret2 = find_group_other(sb, dir, &group, mode);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2c2e6cb..d0bdcd7 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1143,7 +1143,7 @@  enum {
 	Opt_inode_readahead_blks, Opt_journal_ioprio,
 	Opt_dioread_nolock, Opt_dioread_lock,
 	Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
-	Opt_max_dir_size_kb,
+	Opt_max_dir_size_kb, Opt_noorlov,
 };
 
 static const match_table_t tokens = {
@@ -1163,6 +1163,7 @@  static const match_table_t tokens = {
 	{Opt_debug, "debug"},
 	{Opt_removed, "oldalloc"},
 	{Opt_removed, "orlov"},
+	{Opt_noorlov, "noorlov"},
 	{Opt_user_xattr, "user_xattr"},
 	{Opt_nouser_xattr, "nouser_xattr"},
 	{Opt_acl, "acl"},
@@ -1341,6 +1342,7 @@  static const struct mount_opts {
 	int	token;
 	int	mount_opt;
 	int	flags;
+	int	mount_opt2;
 } ext4_mount_opts[] = {
 	{Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET},
 	{Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR},
@@ -1417,6 +1419,7 @@  static const struct mount_opts {
 	{Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
 	{Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
 	{Opt_max_dir_size_kb, 0, MOPT_GTE0},
+	{Opt_noorlov, 0, MOPT_SET, EXT4_MOUNT2_NO_ORLOV},
 	{Opt_err, 0, 0}
 };
 
@@ -1601,6 +1604,7 @@  static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 		} else {
 			clear_opt(sb, DATA_FLAGS);
 			sbi->s_mount_opt |= m->mount_opt;
+			sbi->s_mount_opt2 |= m->mount_opt2;
 		}
 #ifdef CONFIG_QUOTA
 	} else if (m->flags & MOPT_QFMT) {
@@ -1630,10 +1634,13 @@  static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 			WARN_ON(1);
 			return -1;
 		}
-		if (arg != 0)
+		if (arg != 0) {
 			sbi->s_mount_opt |= m->mount_opt;
-		else
+			sbi->s_mount_opt2 |= m->mount_opt2;
+		} else {
 			sbi->s_mount_opt &= ~m->mount_opt;
+			sbi->s_mount_opt2 &= ~m->mount_opt2;
+		}
 	}
 	return 1;
 }
@@ -1777,11 +1784,15 @@  static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
 		if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
 		    (m->flags & MOPT_CLEAR_ERR))
 			continue;
-		if (!(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
+		if (!(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)) &&
+		    !(m->mount_opt2 & sbi->s_mount_opt2))
 			continue; /* skip if same as the default */
-		if ((want_set &&
-		     (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) ||
-		    (!want_set && (sbi->s_mount_opt & m->mount_opt)))
+		if (want_set &&
+		    (((sbi->s_mount_opt & m->mount_opt) != m->mount_opt) ||
+		     ((sbi->s_mount_opt2 & m->mount_opt2) != m->mount_opt2)))
+			continue; /* select Opt_noFoo vs Opt_Foo */
+		if (!want_set && ((sbi->s_mount_opt & m->mount_opt) ||
+				  (sbi->s_mount_opt2 & m->mount_opt2)))
 			continue; /* select Opt_noFoo vs Opt_Foo */
 		SEQ_OPTS_PRINT("%s", token2str(m->token));
 	}