diff mbox

ext4: fall back to vmalloc() for large allocations

Message ID 1310506846-25843-1-git-send-email-adilger@whamcloud.com
State Superseded, archived
Headers show

Commit Message

Andreas Dilger July 12, 2011, 9:40 p.m. UTC
For very large ext4 filesystems (128TB and larger) kmalloc() of
some per-group structures can fail at mount time due to memory
fragmentation.  If kmalloc() fails, fall back to vmalloc() for
the s_group_info and s_group_desc arrays.

Signed-off-by: Yu Jian <yujian@whamcloud.com>
Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
---
 fs/ext4/mballoc.c |   49 +++++++++++++++++++++++++++++++++----------------
 fs/ext4/super.c   |   29 +++++++++++++++++++++++------
 2 files changed, 56 insertions(+), 22 deletions(-)

Comments

Theodore Ts'o July 18, 2011, 1:25 a.m. UTC | #1
On Tue, Jul 12, 2011 at 03:40:46PM -0600, Andreas Dilger wrote:
> For very large ext4 filesystems (128TB and larger) kmalloc() of
> some per-group structures can fail at mount time due to memory
> fragmentation.  If kmalloc() fails, fall back to vmalloc() for
> the s_group_info and s_group_desc arrays.
> 
> Signed-off-by: Yu Jian <yujian@whamcloud.com>
> Signed-off-by: Andreas Dilger <adilger@whamcloud.com>

Andras, was this patch authored by Yu Jian or by you?

>  	sbi->s_buddy_cache = new_inode(sb);
>  	if (sbi->s_buddy_cache == NULL) {
> -		printk(KERN_ERR "EXT4-fs: can't get new inode\n");
> +		ext4_msg(sb, KERN_ERR, "can't get new inode\n");
>  		goto err_freesgi;
>  	}

Using ext4_msg instead of printk is good, but that really should be a
separate patch.


> -	sbi->s_buddy_cache->i_ino = get_next_ino();
> +	/* To avoid potentially colliding with an valid on-disk inode number,
> +	 * use EXT4_BAD_INO for the buddy cache inode number.  This inode is
> +	 * not in the inode hash, so it should never be found by iget(), but
> +	 * this will avoid confusion if it ever shows up during debugging. */
> +	sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;

This should be a separate patch.

> @@ -2457,12 +2472,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
>  		i++;
>  	} while (i <= sb->s_blocksize_bits + 1);
>  
> -	/* init file for buddy data */
> -	ret = ext4_mb_init_backend(sb);
> -	if (ret != 0) {
> -		goto out;
> -	}
> -
>  	spin_lock_init(&sbi->s_md_lock);
>  	spin_lock_init(&sbi->s_bal_lock);
>  
> @@ -2487,6 +2496,11 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
>  		spin_lock_init(&lg->lg_prealloc_lock);
>  	}
>  
> +	/* init file for buddy data */
> +	ret = ext4_mb_init_backend(sb);
> +	if (ret != 0)
> +		goto out;
> +
>  	if (sbi->s_proc)
>  		proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
>  				 &ext4_mb_seq_groups_fops, sb);

Why are you moving ext4_mb_init_backend()?  This should be a separate
patch, with an explanation of what is going on....

       	       		      	      	    - Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andreas Dilger July 18, 2011, 5:12 a.m. UTC | #2
On 2011-07-17, at 7:25 PM, Ted Ts'o wrote:
> On Tue, Jul 12, 2011 at 03:40:46PM -0600, Andreas Dilger wrote:
>> For very large ext4 filesystems (128TB and larger) kmalloc() of
>> some per-group structures can fail at mount time due to memory
>> fragmentation.  If kmalloc() fails, fall back to vmalloc() for
>> the s_group_info and s_group_desc arrays.
>> 
>> Signed-off-by: Yu Jian <yujian@whamcloud.com>
>> Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
> 
> Andreas, was this patch authored by Yu Jian or by you?

Yu Jian wrote it with help from me, and I made some modifications afterward,
so I put both of our names down.

>> 	sbi->s_buddy_cache = new_inode(sb);
>> 	if (sbi->s_buddy_cache == NULL) {
>> -		printk(KERN_ERR "EXT4-fs: can't get new inode\n");
>> +		ext4_msg(sb, KERN_ERR, "can't get new inode\n");
>> 		goto err_freesgi;
>> 	}
> 
> Using ext4_msg instead of printk is good, but that really should be a
> separate patch.

> 
>> -	sbi->s_buddy_cache->i_ino = get_next_ino();
>> +	/* To avoid potentially colliding with an valid on-disk inode number,
>> +	 * use EXT4_BAD_INO for the buddy cache inode number.  This inode is
>> +	 * not in the inode hash, so it should never be found by iget(), but
>> +	 * this will avoid confusion if it ever shows up during debugging. */
>> +	sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
> 
> This should be a separate patch.

Sure.

>> @@ -2457,12 +2472,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
>> 		i++;
>> 	} while (i <= sb->s_blocksize_bits + 1);
>> 
>> -	/* init file for buddy data */
>> -	ret = ext4_mb_init_backend(sb);
>> -	if (ret != 0) {
>> -		goto out;
>> -	}
>> -
>> 	spin_lock_init(&sbi->s_md_lock);
>> 	spin_lock_init(&sbi->s_bal_lock);
>> 
>> @@ -2487,6 +2496,11 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
>> 		spin_lock_init(&lg->lg_prealloc_lock);
>> 	}
>> 
>> +	/* init file for buddy data */
>> +	ret = ext4_mb_init_backend(sb);
>> +	if (ret != 0)
>> +		goto out;
>> +
>> 	if (sbi->s_proc)
>> 		proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
>> 				 &ext4_mb_seq_groups_fops, sb);
> 
> Why are you moving ext4_mb_init_backend()?  This should be a separate
> patch, with an explanation of what is going on....

In ext4_mb_init(), if the s_locality_group allocation fails it will
currently cause the allocations made in ext4_mb_init_backend() to
be leaked.  Moving the ext4_mb_init_backend() allocation after the
s_locality_group allocation avoids that problem.

Cheers, Andreas
--
Andreas Dilger 
Principal Engineer
Whamcloud, Inc.



--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Theodore Ts'o July 18, 2011, 11:36 a.m. UTC | #3
On Sun, Jul 17, 2011 at 11:12:34PM -0600, Andreas Dilger wrote:
> 
> Yu Jian wrote it with help from me, and I made some modifications afterward,
> so I put both of our names down.

OK, I just wanted to know who I should credit as the author in Git.
If you think it should be Yu Jian, could you put in a From: Yu Jian
line?  If you think you made sufficient modifications that you think
it's mainly your code, that's fine.  I just wasn't sure which would
have been more correct, and whether you might have just forgotten an
explicit From: header or not.  Just want to give credit where credit
is due!  :-)

I'm going to mark this patch as "revisions requested" in patchworkand
want for you to send out a broken out patch series.

Many thanks!!

					- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Theodore Ts'o Aug. 1, 2011, 1:13 p.m. UTC | #4
This patch series breaks up Andreas and Yu Jian's patch "fall back to
vmalloc() for large allocations" into 5 patches, with the following
changes:

1) I created new helper functions, ext4_kvmalloc(), ext4_kzalloc, and
	ext4_kvfree() to simplify the code.

2) Fixed the patch so that online resize would work correctly if
	s_group_info and s_group_desc were allocated using vmalloc().
	(Yeah, there are other reasons why online resize won't work for
	such large allocations, but let's not add to the problems; also,
	there's always the possibility that kmalloc might fail for even
	for a small allocation.)

3) Fixed many more places in mballoc.c where ext4_msg() would be better
	than just printk().

I was waiting for Andreas and/or Yu Jian to resubmit, but since they
didn't, and I wanted these fixes, I decided to break up the patch
myself.

						- Ted

Theodore Ts'o (3):
  ext4: introduce ext4_kvmalloc(), ext4_kzalloc(), and ext4_kvfree()
  ext4: use ext4_kvzalloc()/ext4_kvmalloc() for s_group_desc and
    s_group_info
  ext4: use ext4_msg() instead of printk in mballoc

Yu Jian (2):
  ext4: use EXT4_BAD_INO for buddy cache to avoid colliding with valid
    inode #
  ext4: prevent memory leaks from ext4_mb_init_backend() on error path

 fs/ext4/ext4.h    |    3 ++
 fs/ext4/mballoc.c |  103 +++++++++++++++++++++++++++++------------------------
 fs/ext4/resize.c  |   13 ++++---
 fs/ext4/super.c   |   63 +++++++++++++++++++++-----------
 4 files changed, 107 insertions(+), 75 deletions(-)
diff mbox

Patch

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 6ed859d..72c5796 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2325,25 +2325,37 @@  static int ext4_mb_init_backend(struct super_block *sb)
 	while (array_size < sizeof(*sbi->s_group_info) *
 	       num_meta_group_infos_max)
 		array_size = array_size << 1;
-	/* An 8TB filesystem with 64-bit pointers requires a 4096 byte
-	 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
-	 * So a two level scheme suffices for now. */
+	/* A 16TB filesystem with 64-bit pointers requires an 8192 byte
+	 * kmalloc(). Filesystems larger than 2^32 blocks (16TB normally)
+	 * have group descriptors at least twice as large (64 bytes or
+	 * more vs. 32 bytes for traditional ext3 filesystems), so a 128TB
+	 * filesystem needs a 128kB allocation, which may need vmalloc(). */
 	sbi->s_group_info = kzalloc(array_size, GFP_KERNEL);
 	if (sbi->s_group_info == NULL) {
-		printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
-		return -ENOMEM;
+		sbi->s_group_info = vmalloc(array_size);
+		if (sbi->s_group_info != NULL) {
+			memset(sbi->s_group_info, 0, array_size);
+		} else {
+			ext4_msg(sb, KERN_ERR, "no memory for groupinfo (%u)\n",
+				 array_size);
+			return -ENOMEM;
+		}
 	}
 	sbi->s_buddy_cache = new_inode(sb);
 	if (sbi->s_buddy_cache == NULL) {
-		printk(KERN_ERR "EXT4-fs: can't get new inode\n");
+		ext4_msg(sb, KERN_ERR, "can't get new inode\n");
 		goto err_freesgi;
 	}
-	sbi->s_buddy_cache->i_ino = get_next_ino();
+	/* To avoid potentially colliding with an valid on-disk inode number,
+	 * use EXT4_BAD_INO for the buddy cache inode number.  This inode is
+	 * not in the inode hash, so it should never be found by iget(), but
+	 * this will avoid confusion if it ever shows up during debugging. */
+	sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
 	EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
 	for (i = 0; i < ngroups; i++) {
 		desc = ext4_get_group_desc(sb, i, NULL);
 		if (desc == NULL) {
-			printk(KERN_ERR
+			ext4_msg(sb, KERN_ERR,
 				"EXT4-fs: can't read descriptor %u\n", i);
 			goto err_freebuddy;
 		}
@@ -2362,7 +2374,10 @@  err_freebuddy:
 		kfree(sbi->s_group_info[i]);
 	iput(sbi->s_buddy_cache);
 err_freesgi:
-	kfree(sbi->s_group_info);
+	if (is_vmalloc_addr(sbi->s_group_info))
+		vfree(sbi->s_group_info);
+	else
+		kfree(sbi->s_group_info);
 	return -ENOMEM;
 }
 
@@ -2457,12 +2472,6 @@  int ext4_mb_init(struct super_block *sb, int needs_recovery)
 		i++;
 	} while (i <= sb->s_blocksize_bits + 1);
 
-	/* init file for buddy data */
-	ret = ext4_mb_init_backend(sb);
-	if (ret != 0) {
-		goto out;
-	}
-
 	spin_lock_init(&sbi->s_md_lock);
 	spin_lock_init(&sbi->s_bal_lock);
 
@@ -2487,6 +2496,11 @@  int ext4_mb_init(struct super_block *sb, int needs_recovery)
 		spin_lock_init(&lg->lg_prealloc_lock);
 	}
 
+	/* init file for buddy data */
+	ret = ext4_mb_init_backend(sb);
+	if (ret != 0)
+		goto out;
+
 	if (sbi->s_proc)
 		proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
 				 &ext4_mb_seq_groups_fops, sb);
@@ -2544,7 +2558,10 @@  int ext4_mb_release(struct super_block *sb)
 			EXT4_DESC_PER_BLOCK_BITS(sb);
 		for (i = 0; i < num_meta_group_infos; i++)
 			kfree(sbi->s_group_info[i]);
-		kfree(sbi->s_group_info);
+		if (is_vmalloc_addr(sbi->s_group_info))
+			vfree(sbi->s_group_info);
+		else
+			kfree(sbi->s_group_info);
 	}
 	kfree(sbi->s_mb_offsets);
 	kfree(sbi->s_mb_maxs);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9ea71aa..556084b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -789,7 +789,12 @@  static void ext4_put_super(struct super_block *sb)
 
 	for (i = 0; i < sbi->s_gdb_count; i++)
 		brelse(sbi->s_group_desc[i]);
-	kfree(sbi->s_group_desc);
+
+	if (is_vmalloc_addr(sbi->s_group_desc))
+		vfree(sbi->s_group_desc);
+	else
+		kfree(sbi->s_group_desc);
+
 	if (is_vmalloc_addr(sbi->s_flex_groups))
 		vfree(sbi->s_flex_groups);
 	else
@@ -3059,6 +3064,7 @@  static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	int ret = -ENOMEM;
 	int blocksize;
 	unsigned int db_count;
+	size_t size;
 	unsigned int i;
 	int needs_recovery, has_huge_files;
 	__u64 blocks_count;
@@ -3408,11 +3414,18 @@  static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 			(EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
 	db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
 		   EXT4_DESC_PER_BLOCK(sb);
-	sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
-				    GFP_KERNEL);
+	size = (size_t)db_count * sizeof(struct buffer_head *);
+	sbi->s_group_desc = kzalloc(size, GFP_KERNEL);
 	if (sbi->s_group_desc == NULL) {
-		ext4_msg(sb, KERN_ERR, "not enough memory");
-		goto failed_mount;
+		sbi->s_group_desc = vmalloc(size);
+		if (sbi->s_group_desc != NULL) {
+			memset(sbi->s_group_desc, 0, size);
+		} else {
+			ext4_msg(sb, KERN_ERR, "no memory for %u groups (%u)\n",
+				 sbi->s_groups_count, (unsigned int)size);
+			ret = -ENOMEM;
+			goto failed_mount;
+	 	}
 	}
 
 #ifdef CONFIG_PROC_FS
@@ -3756,7 +3769,11 @@  failed_mount3:
 failed_mount2:
 	for (i = 0; i < db_count; i++)
 		brelse(sbi->s_group_desc[i]);
-	kfree(sbi->s_group_desc);
+
+	if (is_vmalloc_addr(sbi->s_group_desc))
+		vfree(sbi->s_group_desc);
+	else
+		kfree(sbi->s_group_desc);
 failed_mount:
 	if (sbi->s_proc) {
 		remove_proc_entry(sb->s_id, ext4_proc_root);