From patchwork Wed Sep 24 16:53:33 2008 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Theodore Ts'o X-Patchwork-Id: 1314 Return-Path: X-Original-To: patchwork-incoming@ozlabs.org Delivered-To: patchwork-incoming@ozlabs.org Received: from vger.kernel.org (vger.kernel.org [209.132.176.167]) by ozlabs.org (Postfix) with ESMTP id 64255DDE17 for ; Thu, 25 Sep 2008 02:53:42 +1000 (EST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1752211AbYIXQxk (ORCPT ); Wed, 24 Sep 2008 12:53:40 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1752346AbYIXQxj (ORCPT ); Wed, 24 Sep 2008 12:53:39 -0400 Received: from www.church-of-our-saviour.ORG ([69.25.196.31]:33172 "EHLO thunker.thunk.org" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1752211AbYIXQxg (ORCPT ); Wed, 24 Sep 2008 12:53:36 -0400 Received: from root (helo=closure.thunk.org) by thunker.thunk.org with local-esmtp (Exim 4.50 #1 (Debian)) id 1KiXcM-00037P-Dv; Wed, 24 Sep 2008 12:53:34 -0400 Received: from tytso by closure.thunk.org with local (Exim 4.69) (envelope-from ) id 1KiXcL-0004Ez-9k; Wed, 24 Sep 2008 12:53:33 -0400 From: Theodore Ts'o To: Ext4 Developers List Cc: Theodore Ts'o Subject: [PATCH 3/3] ext4: Use readahead when reading an inode from the inode table Date: Wed, 24 Sep 2008 12:53:33 -0400 Message-Id: <1222275213-16268-3-git-send-email-tytso@mit.edu> X-Mailer: git-send-email 1.5.6.1.205.ge2c7.dirty In-Reply-To: <1222275213-16268-2-git-send-email-tytso@mit.edu> References: <1222275213-16268-1-git-send-email-tytso@mit.edu> <1222275213-16268-2-git-send-email-tytso@mit.edu> X-SA-Exim-Connect-IP: X-SA-Exim-Mail-From: tytso@mit.edu X-SA-Exim-Scanned: No (on thunker.thunk.org); SAEximRunCond expanded to false Sender: linux-ext4-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-ext4@vger.kernel.org With modern hard drives, reading 64k takes roughly the same time as reading a 4k block. So request readahead for adjacent inode table blocks to reduce the time it takes when iterating over directories (especially when doing this in htree sort order) in a cold cache case. With this patch, the time it takes to run "git status" on a kernel tree after flushing the caches via "echo 3 > /proc/sys/vm/drop_caches" is reduced by 21%. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 2 + fs/ext4/ext4_sb.h | 1 + fs/ext4/inode.c | 129 ++++++++++++++++++++++++----------------------------- fs/ext4/super.c | 27 ++++++++++- 4 files changed, 87 insertions(+), 72 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 163c445..fc7ce2e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -790,6 +790,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) #define EXT4_DEF_RESUID 0 #define EXT4_DEF_RESGID 0 +#define EXT4_DEF_INODE_READAHEAD_BITS 5 + /* * Default mount options */ diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index f92af01..04e1fd2 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h @@ -52,6 +52,7 @@ struct ext4_sb_info { int s_desc_per_block_bits; int s_inode_size; int s_first_ino; + unsigned int s_inode_readahead_bits; spinlock_t s_next_gen_lock; u32 s_next_generation; u32 s_hash_seed[4]; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index eed1265..5c19604 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3833,41 +3833,6 @@ out_stop: ext4_journal_stop(handle); } -static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb, - unsigned long ino, struct ext4_iloc *iloc) -{ - ext4_group_t block_group; - unsigned long offset; - ext4_fsblk_t block; - struct ext4_group_desc *gdp; - - if (!ext4_valid_inum(sb, ino)) { - /* - * This error is already checked for in namei.c unless we are - * looking at an NFS filehandle, in which case no error - * report is needed - */ - return 0; - } - - block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); - gdp = ext4_get_group_desc(sb, block_group, NULL); - if (!gdp) - return 0; - - /* - * Figure out the offset within the block group inode table - */ - offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) * - EXT4_INODE_SIZE(sb); - block = ext4_inode_table(sb, gdp) + - (offset >> EXT4_BLOCK_SIZE_BITS(sb)); - - iloc->block_group = block_group; - iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1); - return block; -} - /* * ext4_get_inode_loc returns with an extra refcount against the inode's * underlying buffer_head on success. If 'in_mem' is true, we have all @@ -3877,19 +3842,35 @@ static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb, static int __ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc, int in_mem) { - ext4_fsblk_t block; - struct buffer_head *bh; + struct ext4_group_desc *gdp; + struct buffer_head *bh; + struct super_block *sb = inode->i_sb; + ext4_fsblk_t block; + int inodes_per_block, inode_offset; + + iloc->bh = 0; + if (!ext4_valid_inum(sb, inode->i_ino)) + return -EIO; - block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc); - if (!block) + iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb); + gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); + if (!gdp) return -EIO; - bh = sb_getblk(inode->i_sb, block); + /* + * Figure out the offset within the block group inode table + */ + inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb)); + inode_offset = ((inode->i_ino - 1) % + EXT4_INODES_PER_GROUP(sb)); + block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); + iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); + + bh = sb_getblk(sb, block); if (!bh) { - ext4_error (inode->i_sb, "ext4_get_inode_loc", - "unable to read inode block - " - "inode=%lu, block=%llu", - inode->i_ino, block); + ext4_error(sb, "ext4_get_inode_loc", "unable to read " + "inode block - inode=%lu, block=%llu", + inode->i_ino, block); return -EIO; } if (!buffer_uptodate(bh)) { @@ -3917,28 +3898,12 @@ static int __ext4_get_inode_loc(struct inode *inode, */ if (in_mem) { struct buffer_head *bitmap_bh; - struct ext4_group_desc *desc; - int inodes_per_buffer; - int inode_offset, i; - ext4_group_t block_group; - int start; - - block_group = (inode->i_ino - 1) / - EXT4_INODES_PER_GROUP(inode->i_sb); - inodes_per_buffer = bh->b_size / - EXT4_INODE_SIZE(inode->i_sb); - inode_offset = ((inode->i_ino - 1) % - EXT4_INODES_PER_GROUP(inode->i_sb)); - start = inode_offset & ~(inodes_per_buffer - 1); + int i, start; - /* Is the inode bitmap in cache? */ - desc = ext4_get_group_desc(inode->i_sb, - block_group, NULL); - if (!desc) - goto make_io; + start = inode_offset & ~(inodes_per_block - 1); - bitmap_bh = sb_getblk(inode->i_sb, - ext4_inode_bitmap(inode->i_sb, desc)); + /* Is the inode bitmap in cache? */ + bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); if (!bitmap_bh) goto make_io; @@ -3951,14 +3916,14 @@ static int __ext4_get_inode_loc(struct inode *inode, brelse(bitmap_bh); goto make_io; } - for (i = start; i < start + inodes_per_buffer; i++) { + for (i = start; i < start + inodes_per_block; i++) { if (i == inode_offset) continue; if (ext4_test_bit(i, bitmap_bh->b_data)) break; } brelse(bitmap_bh); - if (i == start + inodes_per_buffer) { + if (i == start + inodes_per_block) { /* all other inodes are free, so skip I/O */ memset(bh->b_data, 0, bh->b_size); set_buffer_uptodate(bh); @@ -3969,6 +3934,31 @@ static int __ext4_get_inode_loc(struct inode *inode, make_io: /* + * If we need to do any I/O, try to readahead up to 16 + * blocks from the inode table. + */ + if (EXT4_SB(sb)->s_inode_readahead_bits) { + ext4_fsblk_t b, end, table; + int ra = 1 << EXT4_SB(sb)->s_inode_readahead_bits; + unsigned num; + + table = ext4_inode_table(sb, gdp); + b = block & ~(ra-1); + if (table > b) + b = table; + end = b + ra; + num = EXT4_INODES_PER_GROUP(sb); + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + num -= le16_to_cpu(gdp->bg_itable_unused); + table += num / inodes_per_block; + if (end > table) + end = table; + while (b <= end) + sb_breadahead(sb, b++); + } + + /* * There are other valid inodes in the buffer, this inode * has in-inode xattrs, or we don't have this inode in memory. * Read the block from disk. @@ -3978,10 +3968,9 @@ make_io: submit_bh(READ_META, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { - ext4_error(inode->i_sb, "ext4_get_inode_loc", - "unable to read inode block - " - "inode=%lu, block=%llu", - inode->i_ino, block); + ext4_error(sb, "ext4_get_inode_loc", + "unable to read inode block - inode=%lu, " + "block=%llu", inode->i_ino, block); brelse(bh); return -EIO; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6dee26d..7263b0c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -514,8 +514,10 @@ static void ext4_put_super(struct super_block *sb) BUFFER_TRACE(sbi->s_sbh, "marking dirty"); ext4_commit_super(sb, es, 1); } - if (sbi->s_proc) + if (sbi->s_proc) { + remove_proc_entry("inode_readahead_bits", sbi->s_proc); remove_proc_entry(sb->s_id, ext4_proc_root); + } for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); @@ -778,6 +780,10 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) seq_puts(seq, ",data=writeback"); + if (sbi->s_inode_readahead_bits != EXT4_DEF_INODE_READAHEAD_BITS) + seq_printf(seq, ",inode_readahead_bits=%d", + sbi->s_inode_readahead_bits); + ext4_show_quota_options(seq, sb); return 0; } @@ -912,6 +918,7 @@ enum { Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc, + Opt_inode_readahead_bits }; static match_table_t tokens = { @@ -972,6 +979,7 @@ static match_table_t tokens = { {Opt_resize, "resize"}, {Opt_delalloc, "delalloc"}, {Opt_nodelalloc, "nodelalloc"}, + {Opt_inode_readahead_bits, "inode_readahead_bits=%u"}, {Opt_err, NULL}, }; @@ -1380,6 +1388,13 @@ set_qf_format: case Opt_delalloc: set_opt(sbi->s_mount_opt, DELALLOC); break; + case Opt_inode_readahead_bits: + if (match_int(&args[0], &option)) + return 0; + if (option < 0 || option > 31) + return 0; + sbi->s_inode_readahead_bits = option; + break; default: printk(KERN_ERR "EXT4-fs: Unrecognized mount option \"%s\" " @@ -1937,6 +1952,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_mount_opt = 0; sbi->s_resuid = EXT4_DEF_RESUID; sbi->s_resgid = EXT4_DEF_RESGID; + sbi->s_inode_readahead_bits = EXT4_DEF_INODE_READAHEAD_BITS; sbi->s_sb_block = sb_block; unlock_kernel(); @@ -2233,6 +2249,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (ext4_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); + if (sbi->s_proc) + proc_create_data("inode_readahead_bits", 0644, sbi->s_proc, + &ext4_ui_proc_fops, + &sbi->s_inode_readahead_bits); + bgl_lock_init(&sbi->s_blockgroup_lock); for (i = 0; i < db_count; i++) { @@ -2512,8 +2533,10 @@ failed_mount2: brelse(sbi->s_group_desc[i]); kfree(sbi->s_group_desc); failed_mount: - if (sbi->s_proc) + if (sbi->s_proc) { + remove_proc_entry("inode_readahead_bits", sbi->s_proc); remove_proc_entry(sb->s_id, ext4_proc_root); + } #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) kfree(sbi->s_qf_names[i]);