[v2] ext4: make xattr inode reads faster

Message ID 20170717062012.26331-1-tahsin@google.com
State Awaiting Upstream
Headers show

Commit Message

Tahsin Erdogan July 17, 2017, 6:20 a.m.
ext4_xattr_inode_read() currently reads each block sequentially while
waiting for io operation to complete before moving on to the next
block. This prevents request merging in block layer.

Add a ext4_bread_batch() function that starts reads for all blocks
then optionally waits for them to complete. A similar logic is used
in ext4_find_entry(), so update that code to use the new function.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
v2:
 - updated ext4_find_entry() to also use ext4_bread_batch()
 - added wait parameter to ext4_bread_batch()

 fs/ext4/ext4.h  |  2 ++
 fs/ext4/inode.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/namei.c | 43 ++++++++++++++-----------------------------
 fs/ext4/xattr.c | 51 ++++++++++++++++++++++++++++++++-------------------
 4 files changed, 92 insertions(+), 48 deletions(-)

Comments

Tahsin Erdogan July 17, 2017, 6:30 a.m. | #1
> It surprises me that ext4/VFS doesn't already have a helper routine to do this.
> It looks like ext4_find_entry() is doing something similar for read-ahead of
> directory blocks, so it may be worthwhile to consider moving that code over
> to use ext4_bread_batch(), but it looks like it would need to add a flag for
> whether to wait on the buffers to be uptodate or not.

Done.

> It's nice to have the parameter annotation, since it can often be confusing if
> there are multiple int/bool arguments to a function, but, I'd write this as:
>
>         ret = ext4_getblk(NULL, inode, block + i, /* map_flags = */ 0);
>
> I don't know if there is some kind of convention for this kind of comment?

I agree that yours is a bit more intuitive, however the version that
puts the comment after the value is more common in the codebase. I
prefer keeping it as is for consistency reasons. Let me know if you
still prefer the prefix version.

>
>> +             if (IS_ERR(bhs[i])) {
>> +                     err = PTR_ERR(bhs[i]);
>> +                     while (i--)
>> +                             brelse(bhs[i]);
>
> It would be better to do the error cleanup once at the end of the function,
> instead of multiple times in the code, like:
>
>                 if (IS_ERR(bhs[i])) {
>                         err = PTR_ERR(bhs[i]);
>                         bh_count = i;
>                         goto out_brelse;
>                 }

Done.


On Sun, Jul 16, 2017 at 11:20 PM, Tahsin Erdogan <tahsin@google.com> wrote:
> ext4_xattr_inode_read() currently reads each block sequentially while
> waiting for io operation to complete before moving on to the next
> block. This prevents request merging in block layer.
>
> Add a ext4_bread_batch() function that starts reads for all blocks
> then optionally waits for them to complete. A similar logic is used
> in ext4_find_entry(), so update that code to use the new function.
>
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>
> ---
> v2:
>  - updated ext4_find_entry() to also use ext4_bread_batch()
>  - added wait parameter to ext4_bread_batch()
>
>  fs/ext4/ext4.h  |  2 ++
>  fs/ext4/inode.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
>  fs/ext4/namei.c | 43 ++++++++++++++-----------------------------
>  fs/ext4/xattr.c | 51 ++++++++++++++++++++++++++++++++-------------------
>  4 files changed, 92 insertions(+), 48 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 9ebde0cd632e..e9440ed605c0 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -2462,6 +2462,8 @@ extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid);
>  int ext4_inode_is_fast_symlink(struct inode *inode);
>  struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
>  struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
> +int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count,
> +                    bool wait, struct buffer_head **bhs);
>  int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
>                              struct buffer_head *bh_result, int create);
>  int ext4_get_block(struct inode *inode, sector_t iblock,
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 3c600f02673f..70699940e20d 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1015,6 +1015,50 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
>         return ERR_PTR(-EIO);
>  }
>
> +/* Read a contiguous batch of blocks. */
> +int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count,
> +                    bool wait, struct buffer_head **bhs)
> +{
> +       int i, err;
> +
> +       for (i = 0; i < bh_count; i++) {
> +               bhs[i] = ext4_getblk(NULL, inode, block + i, 0 /* map_flags */);
> +               if (IS_ERR(bhs[i])) {
> +                       err = PTR_ERR(bhs[i]);
> +                       bh_count = i;
> +                       goto out_brelse;
> +               }
> +       }
> +
> +       for (i = 0; i < bh_count; i++)
> +               /* Note that NULL bhs[i] is valid because of holes. */
> +               if (bhs[i] && !buffer_uptodate(bhs[i]))
> +                       ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1,
> +                                   &bhs[i]);
> +
> +       if (!wait)
> +               return 0;
> +
> +       for (i = 0; i < bh_count; i++)
> +               if (bhs[i])
> +                       wait_on_buffer(bhs[i]);
> +
> +       for (i = 0; i < bh_count; i++) {
> +               if (bhs[i] && !buffer_uptodate(bhs[i])) {
> +                       err = -EIO;
> +                       goto out_brelse;
> +               }
> +       }
> +       return 0;
> +
> +out_brelse:
> +       for (i = 0; i < bh_count; i++) {
> +               brelse(bhs[i]);
> +               bhs[i] = NULL;
> +       }
> +       return err;
> +}
> +
>  int ext4_walk_page_buffers(handle_t *handle,
>                            struct buffer_head *head,
>                            unsigned from,
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index 13f0cadb1238..2758ff1b940f 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -1342,13 +1342,12 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
>         struct super_block *sb;
>         struct buffer_head *bh_use[NAMEI_RA_SIZE];
>         struct buffer_head *bh, *ret = NULL;
> -       ext4_lblk_t start, block, b;
> +       ext4_lblk_t start, block;
>         const u8 *name = d_name->name;
> -       int ra_max = 0;         /* Number of bh's in the readahead
> +       size_t ra_max = 0;      /* Number of bh's in the readahead
>                                    buffer, bh_use[] */
> -       int ra_ptr = 0;         /* Current index into readahead
> +       size_t ra_ptr = 0;      /* Current index into readahead
>                                    buffer */
> -       int num = 0;
>         ext4_lblk_t  nblocks;
>         int i, namelen, retval;
>         struct ext4_filename fname;
> @@ -1411,31 +1410,17 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
>                 if (ra_ptr >= ra_max) {
>                         /* Refill the readahead buffer */
>                         ra_ptr = 0;
> -                       b = block;
> -                       for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
> -                               /*
> -                                * Terminate if we reach the end of the
> -                                * directory and must wrap, or if our
> -                                * search has finished at this block.
> -                                */
> -                               if (b >= nblocks || (num && block == start)) {
> -                                       bh_use[ra_max] = NULL;
> -                                       break;
> -                               }
> -                               num++;
> -                               bh = ext4_getblk(NULL, dir, b++, 0);
> -                               if (IS_ERR(bh)) {
> -                                       if (ra_max == 0) {
> -                                               ret = bh;
> -                                               goto cleanup_and_exit;
> -                                       }
> -                                       break;
> -                               }
> -                               bh_use[ra_max] = bh;
> -                               if (bh)
> -                                       ll_rw_block(REQ_OP_READ,
> -                                                   REQ_META | REQ_PRIO,
> -                                                   1, &bh);
> +                       if (block < start)
> +                               ra_max = start - block;
> +                       else
> +                               ra_max = nblocks - block;
> +                       ra_max = min(ra_max, ARRAY_SIZE(bh_use));
> +                       retval = ext4_bread_batch(dir, block, ra_max,
> +                                                 false /* wait */, bh_use);
> +                       if (retval) {
> +                               ret = ERR_PTR(retval);
> +                               ra_max = 0;
> +                               goto cleanup_and_exit;
>                         }
>                 }
>                 if ((bh = bh_use[ra_ptr++]) == NULL)
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index cff4f41ced61..68b5b4f9fbcb 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -317,28 +317,41 @@ static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash)
>   */
>  static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size)
>  {
> -       unsigned long block = 0;
> -       struct buffer_head *bh;
> -       int blocksize = ea_inode->i_sb->s_blocksize;
> -       size_t csize, copied = 0;
> -       void *copy_pos = buf;
> -
> -       while (copied < size) {
> -               csize = (size - copied) > blocksize ? blocksize : size - copied;
> -               bh = ext4_bread(NULL, ea_inode, block, 0);
> -               if (IS_ERR(bh))
> -                       return PTR_ERR(bh);
> -               if (!bh)
> -                       return -EFSCORRUPTED;
> +       int blocksize = 1 << ea_inode->i_blkbits;
> +       int bh_count = (size + blocksize - 1) >> ea_inode->i_blkbits;
> +       int tail_size = (size % blocksize) ?: blocksize;
> +       struct buffer_head *bhs_inline[8];
> +       struct buffer_head **bhs = bhs_inline;
> +       int i, ret;
> +
> +       if (bh_count > ARRAY_SIZE(bhs_inline)) {
> +               bhs = kmalloc_array(bh_count, sizeof(*bhs), GFP_NOFS);
> +               if (!bhs)
> +                       return -ENOMEM;
> +       }
>
> -               memcpy(copy_pos, bh->b_data, csize);
> -               brelse(bh);
> +       ret = ext4_bread_batch(ea_inode, 0 /* block */, bh_count,
> +                              true /* wait */, bhs);
> +       if (ret)
> +               goto free_bhs;
>
> -               copy_pos += csize;
> -               block += 1;
> -               copied += csize;
> +       for (i = 0; i < bh_count; i++) {
> +               /* There shouldn't be any holes in ea_inode. */
> +               if (!bhs[i]) {
> +                       ret = -EFSCORRUPTED;
> +                       goto put_bhs;
> +               }
> +               memcpy((char *)buf + blocksize * i, bhs[i]->b_data,
> +                      i < bh_count - 1 ? blocksize : tail_size);
>         }
> -       return 0;
> +       ret = 0;
> +put_bhs:
> +       for (i = 0; i < bh_count; i++)
> +               brelse(bhs[i]);
> +free_bhs:
> +       if (bhs != bhs_inline)
> +               kfree(bhs);
> +       return ret;
>  }
>
>  static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
> --
> 2.13.2.932.g7449e964c-goog
>
Theodore Ts'o Aug. 6, 2017, 4:14 a.m. | #2
On Sun, Jul 16, 2017 at 11:20:12PM -0700, Tahsin Erdogan wrote:
> ext4_xattr_inode_read() currently reads each block sequentially while
> waiting for io operation to complete before moving on to the next
> block. This prevents request merging in block layer.
> 
> Add a ext4_bread_batch() function that starts reads for all blocks
> then optionally waits for them to complete. A similar logic is used
> in ext4_find_entry(), so update that code to use the new function.
> 
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>

Thanks, applied.

					- Ted

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9ebde0cd632e..e9440ed605c0 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2462,6 +2462,8 @@  extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid);
 int ext4_inode_is_fast_symlink(struct inode *inode);
 struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
 struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
+int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count,
+		     bool wait, struct buffer_head **bhs);
 int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
 			     struct buffer_head *bh_result, int create);
 int ext4_get_block(struct inode *inode, sector_t iblock,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3c600f02673f..70699940e20d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1015,6 +1015,50 @@  struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
 	return ERR_PTR(-EIO);
 }
 
+/* Read a contiguous batch of blocks. */
+int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count,
+		     bool wait, struct buffer_head **bhs)
+{
+	int i, err;
+
+	for (i = 0; i < bh_count; i++) {
+		bhs[i] = ext4_getblk(NULL, inode, block + i, 0 /* map_flags */);
+		if (IS_ERR(bhs[i])) {
+			err = PTR_ERR(bhs[i]);
+			bh_count = i;
+			goto out_brelse;
+		}
+	}
+
+	for (i = 0; i < bh_count; i++)
+		/* Note that NULL bhs[i] is valid because of holes. */
+		if (bhs[i] && !buffer_uptodate(bhs[i]))
+			ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1,
+				    &bhs[i]);
+
+	if (!wait)
+		return 0;
+
+	for (i = 0; i < bh_count; i++)
+		if (bhs[i])
+			wait_on_buffer(bhs[i]);
+
+	for (i = 0; i < bh_count; i++) {
+		if (bhs[i] && !buffer_uptodate(bhs[i])) {
+			err = -EIO;
+			goto out_brelse;
+		}
+	}
+	return 0;
+
+out_brelse:
+	for (i = 0; i < bh_count; i++) {
+		brelse(bhs[i]);
+		bhs[i] = NULL;
+	}
+	return err;
+}
+
 int ext4_walk_page_buffers(handle_t *handle,
 			   struct buffer_head *head,
 			   unsigned from,
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 13f0cadb1238..2758ff1b940f 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1342,13 +1342,12 @@  static struct buffer_head * ext4_find_entry (struct inode *dir,
 	struct super_block *sb;
 	struct buffer_head *bh_use[NAMEI_RA_SIZE];
 	struct buffer_head *bh, *ret = NULL;
-	ext4_lblk_t start, block, b;
+	ext4_lblk_t start, block;
 	const u8 *name = d_name->name;
-	int ra_max = 0;		/* Number of bh's in the readahead
+	size_t ra_max = 0;	/* Number of bh's in the readahead
 				   buffer, bh_use[] */
-	int ra_ptr = 0;		/* Current index into readahead
+	size_t ra_ptr = 0;	/* Current index into readahead
 				   buffer */
-	int num = 0;
 	ext4_lblk_t  nblocks;
 	int i, namelen, retval;
 	struct ext4_filename fname;
@@ -1411,31 +1410,17 @@  static struct buffer_head * ext4_find_entry (struct inode *dir,
 		if (ra_ptr >= ra_max) {
 			/* Refill the readahead buffer */
 			ra_ptr = 0;
-			b = block;
-			for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
-				/*
-				 * Terminate if we reach the end of the
-				 * directory and must wrap, or if our
-				 * search has finished at this block.
-				 */
-				if (b >= nblocks || (num && block == start)) {
-					bh_use[ra_max] = NULL;
-					break;
-				}
-				num++;
-				bh = ext4_getblk(NULL, dir, b++, 0);
-				if (IS_ERR(bh)) {
-					if (ra_max == 0) {
-						ret = bh;
-						goto cleanup_and_exit;
-					}
-					break;
-				}
-				bh_use[ra_max] = bh;
-				if (bh)
-					ll_rw_block(REQ_OP_READ,
-						    REQ_META | REQ_PRIO,
-						    1, &bh);
+			if (block < start)
+				ra_max = start - block;
+			else
+				ra_max = nblocks - block;
+			ra_max = min(ra_max, ARRAY_SIZE(bh_use));
+			retval = ext4_bread_batch(dir, block, ra_max,
+						  false /* wait */, bh_use);
+			if (retval) {
+				ret = ERR_PTR(retval);
+				ra_max = 0;
+				goto cleanup_and_exit;
 			}
 		}
 		if ((bh = bh_use[ra_ptr++]) == NULL)
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index cff4f41ced61..68b5b4f9fbcb 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -317,28 +317,41 @@  static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash)
  */
 static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size)
 {
-	unsigned long block = 0;
-	struct buffer_head *bh;
-	int blocksize = ea_inode->i_sb->s_blocksize;
-	size_t csize, copied = 0;
-	void *copy_pos = buf;
-
-	while (copied < size) {
-		csize = (size - copied) > blocksize ? blocksize : size - copied;
-		bh = ext4_bread(NULL, ea_inode, block, 0);
-		if (IS_ERR(bh))
-			return PTR_ERR(bh);
-		if (!bh)
-			return -EFSCORRUPTED;
+	int blocksize = 1 << ea_inode->i_blkbits;
+	int bh_count = (size + blocksize - 1) >> ea_inode->i_blkbits;
+	int tail_size = (size % blocksize) ?: blocksize;
+	struct buffer_head *bhs_inline[8];
+	struct buffer_head **bhs = bhs_inline;
+	int i, ret;
+
+	if (bh_count > ARRAY_SIZE(bhs_inline)) {
+		bhs = kmalloc_array(bh_count, sizeof(*bhs), GFP_NOFS);
+		if (!bhs)
+			return -ENOMEM;
+	}
 
-		memcpy(copy_pos, bh->b_data, csize);
-		brelse(bh);
+	ret = ext4_bread_batch(ea_inode, 0 /* block */, bh_count,
+			       true /* wait */, bhs);
+	if (ret)
+		goto free_bhs;
 
-		copy_pos += csize;
-		block += 1;
-		copied += csize;
+	for (i = 0; i < bh_count; i++) {
+		/* There shouldn't be any holes in ea_inode. */
+		if (!bhs[i]) {
+			ret = -EFSCORRUPTED;
+			goto put_bhs;
+		}
+		memcpy((char *)buf + blocksize * i, bhs[i]->b_data,
+		       i < bh_count - 1 ? blocksize : tail_size);
 	}
-	return 0;
+	ret = 0;
+put_bhs:
+	for (i = 0; i < bh_count; i++)
+		brelse(bhs[i]);
+free_bhs:
+	if (bhs != bhs_inline)
+		kfree(bhs);
+	return ret;
 }
 
 static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,