diff mbox

[-v2] ext4: add max_dir_size_kb mount option

Message ID 1344649235-9289-1-git-send-email-tytso@mit.edu
State Accepted, archived
Headers show

Commit Message

Theodore Ts'o Aug. 11, 2012, 1:40 a.m. UTC
Very large directories can cause significant performance problems, or
perhaps even invoke the OOM killer, if the process is running in a
highly constrained memory environment (whether it is VM's with a small
amount of memory or in a small memory cgroup).

So it is useful, in cloud server/data center environments, to be able
to set a filesystem-wide cap on the maximum size of a directory, to
ensure that directories never get larger than a sane size.  We do this
via a new mount option, max_dir_size_kb.  If there is an attempt to
grow the directory larger than max_dir_size_kb, the system call will
return ENOSPC instead.

Google-Bug-Id: 6863013

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 Documentation/filesystems/ext4.txt | 4 ++++
 fs/ext4/ext4.h                     | 1 +
 fs/ext4/namei.c                    | 7 +++++++
 fs/ext4/super.c                    | 7 +++++++
 4 files changed, 19 insertions(+)

Comments

Andreas Dilger Aug. 11, 2012, 3:22 a.m. UTC | #1
On 2012-08-10, at 19:40, Theodore Ts'o <tytso@mit.edu> wrote:

> Very large directories can cause significant performance problems, or
> perhaps even invoke the OOM killer, if the process is running in a
> highly constrained memory environment (whether it is VM's with a small
> amount of memory or in a small memory cgroup).
> 
> So it is useful, in cloud server/data center environments, to be able
> to set a filesystem-wide cap on the maximum size of a directory, to
> ensure that directories never get larger than a sane size.  We do this
> via a new mount option, max_dir_size_kb.  If there is an attempt to
> grow the directory larger than max_dir_size_kb, the system call will
> return ENOSPC instead.

In our patch, it returns EFBIG, since it isn't really a case of being out of space for blocks or inodes.

Cheers, Andreas

> Google-Bug-Id: 6863013
> 
> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
> ---
> Documentation/filesystems/ext4.txt | 4 ++++
> fs/ext4/ext4.h                     | 1 +
> fs/ext4/namei.c                    | 7 +++++++
> fs/ext4/super.c                    | 7 +++++++
> 4 files changed, 19 insertions(+)
> 
> diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
> index 1b7f9ac..43b80b5 100644
> --- a/Documentation/filesystems/ext4.txt
> +++ b/Documentation/filesystems/ext4.txt
> @@ -375,6 +375,10 @@ dioread_nolock        locking. If the dioread_nolock option is specified
>            Because of the restrictions this options comprises
>            it is off by default (e.g. dioread_lock).
> 
> +max_dir_size_kb=n    This limits the size of directories so that any
> +            attempt to expand them beyond the specified
> +            limit in kilobytes will cause an ENOSPC error.
> +
> i_version        Enable 64-bit inode version support. This option is
>            off by default.
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index c3411d4..7c0841e 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1243,6 +1243,7 @@ struct ext4_sb_info {
>    unsigned int s_mb_order2_reqs;
>    unsigned int s_mb_group_prealloc;
>    unsigned int s_max_writeback_mb_bump;
> +    unsigned int s_max_dir_size_kb;
>    /* where last allocation was done - for stream allocation */
>    unsigned long s_mb_last_group;
>    unsigned long s_mb_last_start;
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index 2a42cc0..7450ff0 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -55,6 +55,13 @@ static struct buffer_head *ext4_append(handle_t *handle,
> {
>    struct buffer_head *bh;
> 
> +    if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb &&
> +             ((inode->i_size >> 10) >=
> +              EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) {
> +        *err = -ENOSPC;
> +        return NULL;
> +    }
> +
>    *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
> 
>    bh = ext4_bread(handle, inode, *block, 1, err);
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 56bcaec..5896dcb 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -1230,6 +1230,7 @@ enum {
>    Opt_inode_readahead_blks, Opt_journal_ioprio,
>    Opt_dioread_nolock, Opt_dioread_lock,
>    Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
> +    Opt_max_dir_size_kb,
> };
> 
> static const match_table_t tokens = {
> @@ -1303,6 +1304,7 @@ static const match_table_t tokens = {
>    {Opt_init_itable, "init_itable=%u"},
>    {Opt_init_itable, "init_itable"},
>    {Opt_noinit_itable, "noinit_itable"},
> +    {Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
>    {Opt_removed, "check=none"},    /* mount option from ext2/3 */
>    {Opt_removed, "nocheck"},    /* mount option from ext2/3 */
>    {Opt_removed, "reservation"},    /* mount option from ext2/3 */
> @@ -1483,6 +1485,7 @@ static const struct mount_opts {
>    {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
>    {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
>    {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
> +    {Opt_max_dir_size_kb, 0, MOPT_GTE0},
>    {Opt_err, 0, 0}
> };
> 
> @@ -1598,6 +1601,8 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
>            if (!args->from)
>                arg = EXT4_DEF_LI_WAIT_MULT;
>            sbi->s_li_wait_mult = arg;
> +        } else if (token == Opt_max_dir_size_kb) {
> +            sbi->s_max_dir_size_kb = arg;
>        } else if (token == Opt_stripe) {
>            sbi->s_stripe = arg;
>        } else if (m->flags & MOPT_DATAJ) {
> @@ -1829,6 +1834,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
>    if (nodefs || (test_opt(sb, INIT_INODE_TABLE) &&
>               (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
>        SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
> +    if (nodefs || sbi->s_max_dir_size_kb)
> +        SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
> 
>    ext4_show_quota_options(seq, sb);
>    return 0;
> -- 
> 1.7.12.rc0.22.gcdd159b
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Theodore Ts'o Aug. 11, 2012, 7:26 p.m. UTC | #2
On Fri, Aug 10, 2012 at 09:22:39PM -0600, Andreas Dilger wrote:
> 
> In our patch, it returns EFBIG, since it isn't really a case of
> being out of space for blocks or inodes.

I agree, EFBIG seems to be a better errno.

How did you configure the directory size limit, BTW?  Did you use a
mount option, if so, what did you name it?

Regards,

					- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andreas Dilger Aug. 11, 2012, 9:10 p.m. UTC | #3
On 2012-08-11, at 1:26 PM, Theodore Ts'o wrote:
> On Fri, Aug 10, 2012 at 09:22:39PM -0600, Andreas Dilger wrote:
>> 
>> In our patch, it returns EFBIG, since it isn't really a case of
>> being out of space for blocks or inodes.
> 
> I agree, EFBIG seems to be a better errno.
> 
> How did you configure the directory size limit, BTW?  Did you use a
> mount option, if so, what did you name it?

It was configured via /sys/fs/ext4/{dev}/max_dir_size, but the units
were bytes instead of kbytes.  Not many sites use this value, so if
it needs to be renamed .../max_dir_size_kb it wouldn't be fatal - we
could modify our patch to allow both, and print a deprecation message
if the old one is used.  It will likely still be a few years before
distros are running 3.6 (or whatever kernel this patch is included in).
At least we wouldn't have to carry that patch in perpetuity.

Cheers, Andreas
--
Andreas Dilger                       Whamcloud, Inc.
Principal Lustre Engineer            http://www.whamcloud.com/




--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Theodore Ts'o Aug. 11, 2012, 9:13 p.m. UTC | #4
On Sat, Aug 11, 2012 at 03:26:48PM -0400, Theodore Ts'o wrote:
> On Fri, Aug 10, 2012 at 09:22:39PM -0600, Andreas Dilger wrote:
> > 
> > In our patch, it returns EFBIG, since it isn't really a case of
> > being out of space for blocks or inodes.
> 
> I agree, EFBIG seems to be a better errno.

Hmmm..... upon doing some more research, there was a related
discussion on this point on the IETF NFSv4 mailing list earlier this
year[1].  In it, Trond argued that EFBIG is defined by POSIX to mean:
"An attempt was made to write a file that exceeds the file size limit
of the process.", while ENOSPC is explicitly documented as an
allowable error code for rename(2):

[ENOSPC]
   The directory that would contain new cannot be extended.

The same definition is there for link(2) and open(2).  For open, it
reads:

[ENOSPC]
    The directory or file system that would contain the new file
    cannot be expanded, the file does not exist, and O_CREAT is
    specified.

Hence, Trond argued that using ENOSPC was a better choice, in terms of
being a closer match with POSIX specifications, and hence what
programs might expect.

The string returned by perror/strerror is going to be a little
confusing to users in either case.  EFBIG returns "File too large",
while ENOSPC returns "No space left on device".  One might argue that
ENOSPC's error return is a little better, but then again there's a
grand Unix tradition in this, after all --- "Not a typewriter",
anyone?  :-)

						- Ted

[1] http://www.ietf.org/mail-archive/web/nfsv4/current/msg10720.html

P.S.  The context for this is a feature which the NetApp filer has,
MaxDirSize, which controls the maximum size of a directory specified
in Kilobytes.  The discussion was what was the proper NFS error code
that should be returned, given how it would be reflected back to a
Posix errno by most clients.

Interestingly, it appears the default MaxDirSize starting with
NetApp's Data before ONTAP 6.5 was 64k.  On newer NetApps, the limit
is defaulted to be 1% of the memory size configured on the filer.  The
reason given for limiting the maximum directory size was for
performance reasons.  Given the issues we've seen when you have jobs
running with a 512mb memory cgroup (which would also apply if you were
running a micro EC2 instance, or some other Xen or KVM VM with a small
memory size), it's interesting that this was an issue that NetApp has
run into, and addressed the same way.

I can definitely attest to the fact that the system will not be happy
if you are limited to 512mb of memory, and you have a 176mb
directory....
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 1b7f9ac..43b80b5 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -375,6 +375,10 @@  dioread_nolock		locking. If the dioread_nolock option is specified
 			Because of the restrictions this options comprises
 			it is off by default (e.g. dioread_lock).
 
+max_dir_size_kb=n	This limits the size of directories so that any
+			attempt to expand them beyond the specified
+			limit in kilobytes will cause an ENOSPC error.
+
 i_version		Enable 64-bit inode version support. This option is
 			off by default.
 
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c3411d4..7c0841e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1243,6 +1243,7 @@  struct ext4_sb_info {
 	unsigned int s_mb_order2_reqs;
 	unsigned int s_mb_group_prealloc;
 	unsigned int s_max_writeback_mb_bump;
+	unsigned int s_max_dir_size_kb;
 	/* where last allocation was done - for stream allocation */
 	unsigned long s_mb_last_group;
 	unsigned long s_mb_last_start;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2a42cc0..7450ff0 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -55,6 +55,13 @@  static struct buffer_head *ext4_append(handle_t *handle,
 {
 	struct buffer_head *bh;
 
+	if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb &&
+		     ((inode->i_size >> 10) >=
+		      EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) {
+		*err = -ENOSPC;
+		return NULL;
+	}
+
 	*block = inode->i_size >> inode->i_sb->s_blocksize_bits;
 
 	bh = ext4_bread(handle, inode, *block, 1, err);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 56bcaec..5896dcb 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1230,6 +1230,7 @@  enum {
 	Opt_inode_readahead_blks, Opt_journal_ioprio,
 	Opt_dioread_nolock, Opt_dioread_lock,
 	Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
+	Opt_max_dir_size_kb,
 };
 
 static const match_table_t tokens = {
@@ -1303,6 +1304,7 @@  static const match_table_t tokens = {
 	{Opt_init_itable, "init_itable=%u"},
 	{Opt_init_itable, "init_itable"},
 	{Opt_noinit_itable, "noinit_itable"},
+	{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
 	{Opt_removed, "check=none"},	/* mount option from ext2/3 */
 	{Opt_removed, "nocheck"},	/* mount option from ext2/3 */
 	{Opt_removed, "reservation"},	/* mount option from ext2/3 */
@@ -1483,6 +1485,7 @@  static const struct mount_opts {
 	{Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
 	{Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
 	{Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
+	{Opt_max_dir_size_kb, 0, MOPT_GTE0},
 	{Opt_err, 0, 0}
 };
 
@@ -1598,6 +1601,8 @@  static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 			if (!args->from)
 				arg = EXT4_DEF_LI_WAIT_MULT;
 			sbi->s_li_wait_mult = arg;
+		} else if (token == Opt_max_dir_size_kb) {
+			sbi->s_max_dir_size_kb = arg;
 		} else if (token == Opt_stripe) {
 			sbi->s_stripe = arg;
 		} else if (m->flags & MOPT_DATAJ) {
@@ -1829,6 +1834,8 @@  static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
 	if (nodefs || (test_opt(sb, INIT_INODE_TABLE) &&
 		       (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
 		SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
+	if (nodefs || sbi->s_max_dir_size_kb)
+		SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
 
 	ext4_show_quota_options(seq, sb);
 	return 0;