[1/2] ext4: try to merge unwritten extents who are also not under io

Message ID 20181125085031.7141-1-xiaoguang.wang@linux.alibaba.com
State New
Headers show
Series
  • [1/2] ext4: try to merge unwritten extents who are also not under io
Related show

Commit Message

Xiaoguang Wang Nov. 25, 2018, 8:50 a.m.
Currently in ext4_can_extents_be_merged(), if one file has unwritten
extents under io, we will not merge any other unwritten extents, even
they are not in range of those unwritten extents under io. This limit
is coarse, indeed we can merge these unwritten extents that are not
under io.

Here add a new ES_IO_B flag to track unwritten extents under io in
extents status tree. When we try to merge unwritten extents, search
given extents in extents status tree, if not found, then we can merge
these unwritten extents.

Note currently we only track unwritten extents under io.

Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
---
 fs/ext4/extents.c        | 29 ++++++++++++++++++++++++++++-
 fs/ext4/extents_status.c |  9 +++++++++
 fs/ext4/extents_status.h | 10 +++++++++-
 fs/ext4/inode.c          | 10 ++++++++++
 4 files changed, 56 insertions(+), 2 deletions(-)

Comments

Xiaoguang Wang Nov. 25, 2018, 9:06 a.m. | #1
Hi,

There will be no [PATCH 2/2], which was caused by my mistake, sorry, only one patch in this mail.
I have run xfstests "-g auto" for this patch, most test cases work well, such
cases will fail:
generic/107 generic/223 generic/347 generic/388 generic/422 generic/475 generic/484, but
even though I do not apply this patch, these cases still may fail.

No matter whether this patch is applied, generic/107, generic/388 and generic/475 sometimes pass,
sometimes fail, other 4 test cases always fail.

Apply LiuBo's "Ext4: fix slow writeback under dioread_nolock and nodelalloc" and this patch, this
slow writeback issue described in Liu Bo's patch will go.

Regards,
Xiaoguang Wang

> Currently in ext4_can_extents_be_merged(), if one file has unwritten
> extents under io, we will not merge any other unwritten extents, even
> they are not in range of those unwritten extents under io. This limit
> is coarse, indeed we can merge these unwritten extents that are not
> under io.
> 
> Here add a new ES_IO_B flag to track unwritten extents under io in
> extents status tree. When we try to merge unwritten extents, search
> given extents in extents status tree, if not found, then we can merge
> these unwritten extents.
> 
> Note currently we only track unwritten extents under io.
> 
> Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
> ---
>   fs/ext4/extents.c        | 29 ++++++++++++++++++++++++++++-
>   fs/ext4/extents_status.c |  9 +++++++++
>   fs/ext4/extents_status.h | 10 +++++++++-
>   fs/ext4/inode.c          | 10 ++++++++++
>   4 files changed, 56 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 240b6dea5441..a93378cd1152 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -1713,6 +1713,33 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
>   	return err;
>   }
>   
> +static int ext4_unwritten_extent_under_io(struct inode *inode,
> +			struct ext4_extent *ex1, struct ext4_extent *ex2)
> +{
> +	unsigned short len;
> +
> +	/*
> +	 * The check for IO to unwritten extent is somewhat racy as we
> +	 * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after
> +	 * dropping i_data_sem. But reserved blocks should save us in that
> +	 * case.
> +	 */
> +	if (atomic_read(&EXT4_I(inode)->i_unwritten) == 0)
> +		return 0;
> +
> +	len = ext4_ext_get_actual_len(ex1);
> +	if (ext4_es_scan_range(inode, &ext4_es_is_under_io, ex1->ee_block,
> +	    ex1->ee_block + len - 1))
> +		return 1;
> +
> +	len = ext4_ext_get_actual_len(ex2);
> +	if (ext4_es_scan_range(inode, &ext4_es_is_under_io, ex2->ee_block,
> +	    ex2->ee_block + len - 1))
> +		return 1;
> +
> +	return 0;
> +}
> +
>   int
>   ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
>   				struct ext4_extent *ex2)
> @@ -1744,7 +1771,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
>   	 */
>   	if (ext4_ext_is_unwritten(ex1) &&
>   	    (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
> -	     atomic_read(&EXT4_I(inode)->i_unwritten) ||
> +	    ext4_unwritten_extent_under_io(inode, ex1, ex2) ||
>   	     (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))
>   		return 0;
>   #ifdef AGGRESSIVE_TEST
> diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
> index 2b439afafe13..04bbd8b7f8f1 100644
> --- a/fs/ext4/extents_status.c
> +++ b/fs/ext4/extents_status.c
> @@ -1332,6 +1332,15 @@ static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
>   		 */
>   		if (ext4_es_is_delayed(es))
>   			goto next;
> +
> +		/*
> +		 * we don't reclaim unwritten extent under io because we use
> +		 * it to check whether we can merge other unwritten extents
> +		 * who are not under io, and when io completes, then we can
> +		 * reclaim this extent.
> +		 */
> +		if (ext4_es_is_under_io(es))
> +			goto next;
>   		if (ext4_es_is_referenced(es)) {
>   			ext4_es_clear_referenced(es);
>   			goto next;
> diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
> index 131a8b7df265..29a985600a47 100644
> --- a/fs/ext4/extents_status.h
> +++ b/fs/ext4/extents_status.h
> @@ -36,6 +36,7 @@ enum {
>   	ES_DELAYED_B,
>   	ES_HOLE_B,
>   	ES_REFERENCED_B,
> +	ES_IO_B,
>   	ES_FLAGS
>   };
>   
> @@ -47,11 +48,13 @@ enum {
>   #define EXTENT_STATUS_DELAYED	(1 << ES_DELAYED_B)
>   #define EXTENT_STATUS_HOLE	(1 << ES_HOLE_B)
>   #define EXTENT_STATUS_REFERENCED	(1 << ES_REFERENCED_B)
> +#define EXTENT_STATUS_IO	(1 << ES_IO_B)
>   
>   #define ES_TYPE_MASK	((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \
>   			  EXTENT_STATUS_UNWRITTEN | \
>   			  EXTENT_STATUS_DELAYED | \
> -			  EXTENT_STATUS_HOLE) << ES_SHIFT)
> +			  EXTENT_STATUS_HOLE | \
> +			  EXTENT_STATUS_IO) << ES_SHIFT)
>   
>   struct ext4_sb_info;
>   struct ext4_extent;
> @@ -173,6 +176,11 @@ static inline int ext4_es_is_delayed(struct extent_status *es)
>   	return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0;
>   }
>   
> +static inline int ext4_es_is_under_io(struct extent_status *es)
> +{
> +	return (ext4_es_type(es) & EXTENT_STATUS_IO) != 0;
> +}
> +
>   static inline int ext4_es_is_hole(struct extent_status *es)
>   {
>   	return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0;
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 22a9d8159720..516966197257 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -704,6 +704,16 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
>   		    ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
>   				       map->m_lblk + map->m_len - 1))
>   			status |= EXTENT_STATUS_DELAYED;
> +		/*
> +		 * track unwritten extent under io. when io completes, we'll
> +		 * convert unwritten extent to written, ext4_es_insert_extent()
> +		 * will be called again to insert this written extent, then
> +		 * EXTENT_STATUS_IO will be cleared automatically, see remove
> +		 * logic in ext4_es_insert_extent().
> +		 */
> +		if ((status & EXTENT_STATUS_UNWRITTEN) && (flags &
> +		    EXT4_GET_BLOCKS_IO_SUBMIT))
> +			status |= EXTENT_STATUS_IO;
>   		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
>   					    map->m_pblk, status);
>   		if (ret < 0) {
>
Xiaoguang Wang Dec. 4, 2018, 10:55 a.m. | #2
hi,

ping :)

Regards,
Xiaoguang Wang

> Currently in ext4_can_extents_be_merged(), if one file has unwritten
> extents under io, we will not merge any other unwritten extents, even
> they are not in range of those unwritten extents under io. This limit
> is coarse, indeed we can merge these unwritten extents that are not
> under io.
> 
> Here add a new ES_IO_B flag to track unwritten extents under io in
> extents status tree. When we try to merge unwritten extents, search
> given extents in extents status tree, if not found, then we can merge
> these unwritten extents.
> 
> Note currently we only track unwritten extents under io.
> 
> Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
> ---
>   fs/ext4/extents.c        | 29 ++++++++++++++++++++++++++++-
>   fs/ext4/extents_status.c |  9 +++++++++
>   fs/ext4/extents_status.h | 10 +++++++++-
>   fs/ext4/inode.c          | 10 ++++++++++
>   4 files changed, 56 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 240b6dea5441..a93378cd1152 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -1713,6 +1713,33 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
>   	return err;
>   }
>   
> +static int ext4_unwritten_extent_under_io(struct inode *inode,
> +			struct ext4_extent *ex1, struct ext4_extent *ex2)
> +{
> +	unsigned short len;
> +
> +	/*
> +	 * The check for IO to unwritten extent is somewhat racy as we
> +	 * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after
> +	 * dropping i_data_sem. But reserved blocks should save us in that
> +	 * case.
> +	 */
> +	if (atomic_read(&EXT4_I(inode)->i_unwritten) == 0)
> +		return 0;
> +
> +	len = ext4_ext_get_actual_len(ex1);
> +	if (ext4_es_scan_range(inode, &ext4_es_is_under_io, ex1->ee_block,
> +	    ex1->ee_block + len - 1))
> +		return 1;
> +
> +	len = ext4_ext_get_actual_len(ex2);
> +	if (ext4_es_scan_range(inode, &ext4_es_is_under_io, ex2->ee_block,
> +	    ex2->ee_block + len - 1))
> +		return 1;
> +
> +	return 0;
> +}
> +
>   int
>   ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
>   				struct ext4_extent *ex2)
> @@ -1744,7 +1771,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
>   	 */
>   	if (ext4_ext_is_unwritten(ex1) &&
>   	    (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
> -	     atomic_read(&EXT4_I(inode)->i_unwritten) ||
> +	    ext4_unwritten_extent_under_io(inode, ex1, ex2) ||
>   	     (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))
>   		return 0;
>   #ifdef AGGRESSIVE_TEST
> diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
> index 2b439afafe13..04bbd8b7f8f1 100644
> --- a/fs/ext4/extents_status.c
> +++ b/fs/ext4/extents_status.c
> @@ -1332,6 +1332,15 @@ static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
>   		 */
>   		if (ext4_es_is_delayed(es))
>   			goto next;
> +
> +		/*
> +		 * we don't reclaim unwritten extent under io because we use
> +		 * it to check whether we can merge other unwritten extents
> +		 * who are not under io, and when io completes, then we can
> +		 * reclaim this extent.
> +		 */
> +		if (ext4_es_is_under_io(es))
> +			goto next;
>   		if (ext4_es_is_referenced(es)) {
>   			ext4_es_clear_referenced(es);
>   			goto next;
> diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
> index 131a8b7df265..29a985600a47 100644
> --- a/fs/ext4/extents_status.h
> +++ b/fs/ext4/extents_status.h
> @@ -36,6 +36,7 @@ enum {
>   	ES_DELAYED_B,
>   	ES_HOLE_B,
>   	ES_REFERENCED_B,
> +	ES_IO_B,
>   	ES_FLAGS
>   };
>   
> @@ -47,11 +48,13 @@ enum {
>   #define EXTENT_STATUS_DELAYED	(1 << ES_DELAYED_B)
>   #define EXTENT_STATUS_HOLE	(1 << ES_HOLE_B)
>   #define EXTENT_STATUS_REFERENCED	(1 << ES_REFERENCED_B)
> +#define EXTENT_STATUS_IO	(1 << ES_IO_B)
>   
>   #define ES_TYPE_MASK	((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \
>   			  EXTENT_STATUS_UNWRITTEN | \
>   			  EXTENT_STATUS_DELAYED | \
> -			  EXTENT_STATUS_HOLE) << ES_SHIFT)
> +			  EXTENT_STATUS_HOLE | \
> +			  EXTENT_STATUS_IO) << ES_SHIFT)
>   
>   struct ext4_sb_info;
>   struct ext4_extent;
> @@ -173,6 +176,11 @@ static inline int ext4_es_is_delayed(struct extent_status *es)
>   	return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0;
>   }
>   
> +static inline int ext4_es_is_under_io(struct extent_status *es)
> +{
> +	return (ext4_es_type(es) & EXTENT_STATUS_IO) != 0;
> +}
> +
>   static inline int ext4_es_is_hole(struct extent_status *es)
>   {
>   	return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0;
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 22a9d8159720..516966197257 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -704,6 +704,16 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
>   		    ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
>   				       map->m_lblk + map->m_len - 1))
>   			status |= EXTENT_STATUS_DELAYED;
> +		/*
> +		 * track unwritten extent under io. when io completes, we'll
> +		 * convert unwritten extent to written, ext4_es_insert_extent()
> +		 * will be called again to insert this written extent, then
> +		 * EXTENT_STATUS_IO will be cleared automatically, see remove
> +		 * logic in ext4_es_insert_extent().
> +		 */
> +		if ((status & EXTENT_STATUS_UNWRITTEN) && (flags &
> +		    EXT4_GET_BLOCKS_IO_SUBMIT))
> +			status |= EXTENT_STATUS_IO;
>   		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
>   					    map->m_pblk, status);
>   		if (ret < 0) {
>
Theodore Ts'o Dec. 10, 2018, 5:53 a.m. | #3
On Sun, Nov 25, 2018 at 05:06:41PM +0800, Xiaoguang Wang wrote:
> Hi,
> 
> There will be no [PATCH 2/2], which was caused by my mistake, sorry, only one patch in this mail.
> I have run xfstests "-g auto" for this patch, most test cases work well, such
> cases will fail:
> generic/107 generic/223 generic/347 generic/388 generic/422 generic/475 generic/484, but
> even though I do not apply this patch, these cases still may fail.
> 
> No matter whether this patch is applied, generic/107, generic/388 and generic/475 sometimes pass,
> sometimes fail, other 4 test cases always fail.
> 
> Apply LiuBo's "Ext4: fix slow writeback under dioread_nolock and nodelalloc" and this patch, this
> slow writeback issue described in Liu Bo's patch will go.

Hi, my apologies for not responding earlier.  I had been
waiting/hoping that Eric Whitney, who has last made a lot of changes
around the extent status tree management, would take a look at your
changes and comment.

The other thing which I'm very much wondering about is why you are
seeing such a large number of test failures.  What sort of file system
configuration and kernel version are you seeing these failures?  My
current test runs aren't showing anything like this --- and neeither
is Eric Whitney's testing.  I've attached a recent test report below.

					- Ted

TESTRUNID: ltm-20181208020855
KERNEL:    kernel 4.20.0-rc4-xfstests-00010-ge647e29196b7 #772 SMP Wed Dec 5 19:38:26 EST 2018 x86_64
CMDLINE:   full --kernel gs://gce-xfstests/bzImage
CPUS:      2
MEM:       7680

ext4/4k: 444 tests, 2 failures, 42 skipped, 4272 seconds
  Failures: ext4/034 generic/388 
ext4/1k: 455 tests, 4 failures, 54 skipped, 4624 seconds
  Failures: ext4/034 generic/383 generic/388 generic/454 
ext4/ext3: 503 tests, 3 failures, 104 skipped, 3857 seconds
  Failures: ext4/034 generic/235 generic/388 
ext4/encrypt: 512 tests, 1 failures, 123 skipped, 2828 seconds
  Failures: ext4/034 
ext4/nojournal: 494 tests, 2 failures, 95 skipped, 3407 seconds
  Failures: ext4/301 generic/113 
ext4/ext3conv: 443 tests, 2 failures, 42 skipped, 4240 seconds
  Failures: ext4/034 generic/388 
ext4/adv: 448 tests, 5 failures, 48 skipped, 4165 seconds
  Failures: ext4/034 generic/388 generic/399 generic/477 generic/519 
ext4/dioread_nolock: 443 tests, 2 failures, 42 skipped, 4424 seconds
  Failures: ext4/034 generic/388 
ext4/data_journal: 490 tests, 4 failures, 90 skipped, 4954 seconds
  Failures: ext4/034 generic/371 generic/388 generic/475 
ext4/bigalloc: 429 tests, 6 failures, 49 skipped, 4957 seconds
  Failures: ext4/034 generic/204 generic/219 generic/273 generic/388 
    generic/500 
ext4/bigalloc_1k: 443 tests, 7 failures, 63 skipped, 3717 seconds
  Failures: ext4/034 generic/204 generic/273 generic/383 generic/388 
    generic/454 generic/500 
Totals: 4352 tests, 752 skipped, 38 failures, 0 errors, 45223s

FSTESTIMG: gce-xfstests/xfstests-201812071306
FSTESTPRJ: gce-xfstests
FSTESTVER: blktests b237a09 (Mon, 26 Nov 2018 11:35:33 -0800)
FSTESTVER: fio  fio-3.2 (Fri, 3 Nov 2017 15:23:49 -0600)
FSTESTVER: fsverity bdebc45 (Wed, 5 Sep 2018 21:32:22 -0700)
FSTESTVER: ima-evm-utils 0267fa1 (Mon, 3 Dec 2018 06:11:35 -0500)
FSTESTVER: quota  59b280e (Mon, 5 Feb 2018 16:48:22 +0100)
FSTESTVER: stress-ng 977ae357 (Wed, 6 Sep 2017 23:45:03 -0400)
FSTESTVER: syzkaller 4b6d14f2 (Tue, 27 Nov 2018 13:16:46 +0100)
FSTESTVER: xfsprogs v4.19.0 (Fri, 9 Nov 2018 14:31:04 -0600)
FSTESTVER: xfstests-bld c07ca47 (Fri, 7 Dec 2018 12:56:06 -0500)
FSTESTVER: xfstests linux-v3.8-2234-g8636a571 (Fri, 7 Dec 2018 12:59:13 -0500)
FSTESTSET: -g auto
FSTESTOPT: aex
Jan Kara Dec. 10, 2018, 4:17 p.m. | #4
On Sun 25-11-18 16:50:31, Xiaoguang Wang wrote:
> Currently in ext4_can_extents_be_merged(), if one file has unwritten
> extents under io, we will not merge any other unwritten extents, even
> they are not in range of those unwritten extents under io. This limit
> is coarse, indeed we can merge these unwritten extents that are not
> under io.
> 
> Here add a new ES_IO_B flag to track unwritten extents under io in
> extents status tree. When we try to merge unwritten extents, search
> given extents in extents status tree, if not found, then we can merge
> these unwritten extents.
> 
> Note currently we only track unwritten extents under io.

Thanks for the patch.
 
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 240b6dea5441..a93378cd1152 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -1713,6 +1713,33 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
>  	return err;
>  }
>  
> +static int ext4_unwritten_extent_under_io(struct inode *inode,
> +			struct ext4_extent *ex1, struct ext4_extent *ex2)
> +{

What if this took just starting block and length? There's no big point in
passing two extents here...

> +	unsigned short len;
> +
> +	/*
> +	 * The check for IO to unwritten extent is somewhat racy as we
> +	 * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after
> +	 * dropping i_data_sem. But reserved blocks should save us in that
> +	 * case.
> +	 */
> +	if (atomic_read(&EXT4_I(inode)->i_unwritten) == 0)
> +		return 0;
> +
> +	len = ext4_ext_get_actual_len(ex1);
> +	if (ext4_es_scan_range(inode, &ext4_es_is_under_io, ex1->ee_block,
> +	    ex1->ee_block + len - 1))
> +		return 1;
> +
> +	len = ext4_ext_get_actual_len(ex2);
> +	if (ext4_es_scan_range(inode, &ext4_es_is_under_io, ex2->ee_block,
> +	    ex2->ee_block + len - 1))
> +		return 1;
> +
> +	return 0;
> +}
> +
>  int
>  ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
>  				struct ext4_extent *ex2)
> @@ -1744,7 +1771,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
>  	 */
>  	if (ext4_ext_is_unwritten(ex1) &&
>  	    (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
> -	     atomic_read(&EXT4_I(inode)->i_unwritten) ||
> +	    ext4_unwritten_extent_under_io(inode, ex1, ex2) ||
>  	     (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))

I'd check ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN before
ext4_unwritten_extent_under_io() as that is a cheaper check. Also we know
that extents are adjacent so we can just call:

	ext4_unwritten_extent_under_io(inode, le32_to_cpu(ex1->ee_block),
					ext1_ee_len + ext2_ee_len)

and save one extent status tree lookup & iteration.

> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 22a9d8159720..516966197257 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -704,6 +704,16 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
>  		    ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
>  				       map->m_lblk + map->m_len - 1))
>  			status |= EXTENT_STATUS_DELAYED;
> +		/*
> +		 * track unwritten extent under io. when io completes, we'll
                   ^ capital T                      ^ capital W

> +		 * convert unwritten extent to written, ext4_es_insert_extent()
> +		 * will be called again to insert this written extent, then
> +		 * EXTENT_STATUS_IO will be cleared automatically, see remove
> +		 * logic in ext4_es_insert_extent().
> +		 */
> +		if ((status & EXTENT_STATUS_UNWRITTEN) && (flags &
> +		    EXT4_GET_BLOCKS_IO_SUBMIT))
> +			status |= EXTENT_STATUS_IO;
>  		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
>  					    map->m_pblk, status);
>  		if (ret < 0) {

OK, but you fail to clear EXTENT_STATUS_IO if we fail to submit IO for some
reason or if the IO ends with IO error, don't you? I guess for these error
cases you can just iterate through all the range covered by ioend and clear
EXTENT_STATUS_IO bits. We don't care about performance in that case and it
is the simplest solution I see.

								Honza
Xiaoguang Wang Dec. 12, 2018, 1:47 p.m. | #5
hi,

> On Sun, Nov 25, 2018 at 05:06:41PM +0800, Xiaoguang Wang wrote:
>> Hi,
>>
>> There will be no [PATCH 2/2], which was caused by my mistake, sorry, only one patch in this mail.
>> I have run xfstests "-g auto" for this patch, most test cases work well, such
>> cases will fail:
>> generic/107 generic/223 generic/347 generic/388 generic/422 generic/475 generic/484, but
>> even though I do not apply this patch, these cases still may fail.
>>
>> No matter whether this patch is applied, generic/107, generic/388 and generic/475 sometimes pass,
>> sometimes fail, other 4 test cases always fail.
>>
>> Apply LiuBo's "Ext4: fix slow writeback under dioread_nolock and nodelalloc" and this patch, this
>> slow writeback issue described in Liu Bo's patch will go.
> 
> Hi, my apologies for not responding earlier.  I had been
> waiting/hoping that Eric Whitney, who has last made a lot of changes
> around the extent status tree management, would take a look at your
> changes and comment.
I see, that's ok.

> 
> The other thing which I'm very much wondering about is why you are
> seeing such a large number of test failures.  What sort of file system
> configuration and kernel version are you seeing these failures?  My
> current test runs aren't showing anything like this --- and neeither
> is Eric Whitney's testing.  I've attached a recent test report below.
I run a new tests against newest upstream kernel without my patches:
ext4/034 ext4/305 generic/223 generic/371 generic/422 generic/484 shared/004 failed,

ext4/034: liubo has sent a patch to fix this bug.
ext/305 shared/004: umount failed, but maybe my environment issuse.
Look like other 4 test cases maybe related to fs or test cases bugs.
I attach test results here, later when I have some free time, I'll look into them.

Regards,
Xiaoguang Wang
[lege@localhost xfstests-dev]$ sudo ./check ext4/034 ext4/305 generic/223 generic/371 generic/422 generic/484 shared/004
FSTYP         -- ext4
PLATFORM      -- Linux/x86_64 localhost 4.20.0-rc6+
MKFS_OPTIONS  -- /dev/sda6
MOUNT_OPTIONS -- -o dioread_nolock,nodelalloc /dev/sda6 /mnt/scratch

ext4/034	_check_generic_filesystem: filesystem on /dev/sda6 is inconsistent
(see /home/lege/xfstests-dev/results//ext4/034.full for details)

ext4/305 2s ... - output mismatch (see /home/lege/xfstests-dev/results//ext4/305.out.bad)
     --- tests/ext4/305.out	2018-11-13 17:38:29.682879168 +0800
     +++ /home/lege/xfstests-dev/results//ext4/305.out.bad	2018-12-12 21:31:00.674010908 +0800
     @@ -1,2 +1,4 @@
      QA output created by 305
      Silence is golden
     +umount: /mnt/scratch: target is busy.
     +mount: /mnt/scratch: /dev/sda6 already mounted on /mnt/scratch.
     ...
     (Run 'diff -u /home/lege/xfstests-dev/tests/ext4/305.out /home/lege/xfstests-dev/results//ext4/305.out.bad'  to see the entire diff)
generic/223 9s ... - output mismatch (see /home/lege/xfstests-dev/results//generic/223.out.bad)
     --- tests/generic/223.out	2018-11-13 17:38:29.701879505 +0800
     +++ /home/lege/xfstests-dev/results//generic/223.out.bad	2018-12-12 21:31:17.689483781 +0800
     @@ -204,47 +204,47 @@
      SCRATCH_MNT/file-1-131072-falloc: well-aligned
      SCRATCH_MNT/file-1-131072-write: well-aligned
      SCRATCH_MNT/file-2-131072-falloc: well-aligned
     -SCRATCH_MNT/file-2-131072-write: well-aligned
     +SCRATCH_MNT/file-2-131072-write: Start block 34320 not multiple of sunit 32
      SCRATCH_MNT/file-3-131072-falloc: well-aligned
      SCRATCH_MNT/file-3-131072-write: well-aligned
     ...
     (Run 'diff -u /home/lege/xfstests-dev/tests/generic/223.out /home/lege/xfstests-dev/results//generic/223.out.bad'  to see the entire diff)
generic/371 13s ... - output mismatch (see /home/lege/xfstests-dev/results//generic/371.out.bad)
     --- tests/generic/371.out	2018-11-13 17:38:29.712879700 +0800
     +++ /home/lege/xfstests-dev/results//generic/371.out.bad	2018-12-12 21:31:31.390059348 +0800
     @@ -1,2 +1,6 @@
      QA output created by 371
      Silence is golden
     +pwrite: No space left on device
     +pwrite: No space left on device
     +pwrite: No space left on device
     +pwrite: No space left on device
     ...
     (Run 'diff -u /home/lege/xfstests-dev/tests/generic/371.out /home/lege/xfstests-dev/results//generic/371.out.bad'  to see the entire diff)
generic/422 1s ... - output mismatch (see /home/lege/xfstests-dev/results//generic/422.out.bad)
     --- tests/generic/422.out	2018-11-13 17:38:29.717879788 +0800
     +++ /home/lege/xfstests-dev/results//generic/422.out.bad	2018-12-12 21:31:33.022008791 +0800
     @@ -17,4 +17,4 @@
      XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
      wrote 65536/65536 bytes at offset 0
      XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
     -Space used before and after writeback is equal
     +Files /mnt/scratch/422.before and /mnt/scratch/422.after differ
     ...
     (Run 'diff -u /home/lege/xfstests-dev/tests/generic/422.out /home/lege/xfstests-dev/results//generic/422.out.bad'  to see the entire diff)
generic/484	- output mismatch (see /home/lege/xfstests-dev/results//generic/484.out.bad)
     --- tests/generic/484.out	2018-11-13 17:38:29.721879859 +0800
     +++ /home/lege/xfstests-dev/results//generic/484.out.bad	2018-12-12 21:31:33.307999931 +0800
     @@ -1,2 +1,3 @@
      QA output created by 484
     +record lock is not preserved across execve(2)
      Silence is golden
     ...
     (Run 'diff -u /home/lege/xfstests-dev/tests/generic/484.out /home/lege/xfstests-dev/results//generic/484.out.bad'  to see the entire diff)
shared/004	[failed, exit status 1]- output mismatch (see /home/lege/xfstests-dev/results//shared/004.out.bad)
     --- tests/shared/004.out	2018-11-13 17:38:29.729880001 +0800
     +++ /home/lege/xfstests-dev/results//shared/004.out.bad	2018-12-12 21:31:33.718987199 +0800
     @@ -1,2 +1,6 @@
      QA output created by 004
      Silence is golden
     +umount: /mnt/scratch: target is busy.
     +mount: /mnt/scratch: /dev/sda6 already mounted on /mnt/scratch.
     +mount failed
     +(see /home/lege/xfstests-dev/results//shared/004.full for details)
     ...
     (Run 'diff -u /home/lege/xfstests-dev/tests/shared/004.out /home/lege/xfstests-dev/results//shared/004.out.bad'  to see the entire diff)
Ran: ext4/034 ext4/305 generic/223 generic/371 generic/422 generic/484 shared/004
Failures: ext4/034 ext4/305 generic/223 generic/371 generic/422 generic/484 shared/004
Failed 7 of 7 tests




> 
> 					- Ted
> 
> TESTRUNID: ltm-20181208020855
> KERNEL:    kernel 4.20.0-rc4-xfstests-00010-ge647e29196b7 #772 SMP Wed Dec 5 19:38:26 EST 2018 x86_64
> CMDLINE:   full --kernel gs://gce-xfstests/bzImage
> CPUS:      2
> MEM:       7680
> 
> ext4/4k: 444 tests, 2 failures, 42 skipped, 4272 seconds
>    Failures: ext4/034 generic/388
> ext4/1k: 455 tests, 4 failures, 54 skipped, 4624 seconds
>    Failures: ext4/034 generic/383 generic/388 generic/454
> ext4/ext3: 503 tests, 3 failures, 104 skipped, 3857 seconds
>    Failures: ext4/034 generic/235 generic/388
> ext4/encrypt: 512 tests, 1 failures, 123 skipped, 2828 seconds
>    Failures: ext4/034
> ext4/nojournal: 494 tests, 2 failures, 95 skipped, 3407 seconds
>    Failures: ext4/301 generic/113
> ext4/ext3conv: 443 tests, 2 failures, 42 skipped, 4240 seconds
>    Failures: ext4/034 generic/388
> ext4/adv: 448 tests, 5 failures, 48 skipped, 4165 seconds
>    Failures: ext4/034 generic/388 generic/399 generic/477 generic/519
> ext4/dioread_nolock: 443 tests, 2 failures, 42 skipped, 4424 seconds
>    Failures: ext4/034 generic/388
> ext4/data_journal: 490 tests, 4 failures, 90 skipped, 4954 seconds
>    Failures: ext4/034 generic/371 generic/388 generic/475
> ext4/bigalloc: 429 tests, 6 failures, 49 skipped, 4957 seconds
>    Failures: ext4/034 generic/204 generic/219 generic/273 generic/388
>      generic/500
> ext4/bigalloc_1k: 443 tests, 7 failures, 63 skipped, 3717 seconds
>    Failures: ext4/034 generic/204 generic/273 generic/383 generic/388
>      generic/454 generic/500
> Totals: 4352 tests, 752 skipped, 38 failures, 0 errors, 45223s
> 
> FSTESTIMG: gce-xfstests/xfstests-201812071306
> FSTESTPRJ: gce-xfstests
> FSTESTVER: blktests b237a09 (Mon, 26 Nov 2018 11:35:33 -0800)
> FSTESTVER: fio  fio-3.2 (Fri, 3 Nov 2017 15:23:49 -0600)
> FSTESTVER: fsverity bdebc45 (Wed, 5 Sep 2018 21:32:22 -0700)
> FSTESTVER: ima-evm-utils 0267fa1 (Mon, 3 Dec 2018 06:11:35 -0500)
> FSTESTVER: quota  59b280e (Mon, 5 Feb 2018 16:48:22 +0100)
> FSTESTVER: stress-ng 977ae357 (Wed, 6 Sep 2017 23:45:03 -0400)
> FSTESTVER: syzkaller 4b6d14f2 (Tue, 27 Nov 2018 13:16:46 +0100)
> FSTESTVER: xfsprogs v4.19.0 (Fri, 9 Nov 2018 14:31:04 -0600)
> FSTESTVER: xfstests-bld c07ca47 (Fri, 7 Dec 2018 12:56:06 -0500)
> FSTESTVER: xfstests linux-v3.8-2234-g8636a571 (Fri, 7 Dec 2018 12:59:13 -0500)
> FSTESTSET: -g auto
> FSTESTOPT: aex
>
Xiaoguang Wang Dec. 12, 2018, 2:02 p.m. | #6
> On Sun 25-11-18 16:50:31, Xiaoguang Wang wrote:
>> Currently in ext4_can_extents_be_merged(), if one file has unwritten
>> extents under io, we will not merge any other unwritten extents, even
>> they are not in range of those unwritten extents under io. This limit
>> is coarse, indeed we can merge these unwritten extents that are not
>> under io.
>>
>> Here add a new ES_IO_B flag to track unwritten extents under io in
>> extents status tree. When we try to merge unwritten extents, search
>> given extents in extents status tree, if not found, then we can merge
>> these unwritten extents.
>>
>> Note currently we only track unwritten extents under io.
> 
> Thanks for the patch.
>   
>> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
>> index 240b6dea5441..a93378cd1152 100644
>> --- a/fs/ext4/extents.c
>> +++ b/fs/ext4/extents.c
>> @@ -1713,6 +1713,33 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
>>   	return err;
>>   }
>>   
>> +static int ext4_unwritten_extent_under_io(struct inode *inode,
>> +			struct ext4_extent *ex1, struct ext4_extent *ex2)
>> +{
> 
> What if this took just starting block and length? There's no big point in
> passing two extents here...
> 
>> +	unsigned short len;
>> +
>> +	/*
>> +	 * The check for IO to unwritten extent is somewhat racy as we
>> +	 * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after
>> +	 * dropping i_data_sem. But reserved blocks should save us in that
>> +	 * case.
>> +	 */
>> +	if (atomic_read(&EXT4_I(inode)->i_unwritten) == 0)
>> +		return 0;
>> +
>> +	len = ext4_ext_get_actual_len(ex1);
>> +	if (ext4_es_scan_range(inode, &ext4_es_is_under_io, ex1->ee_block,
>> +	    ex1->ee_block + len - 1))
>> +		return 1;
>> +
>> +	len = ext4_ext_get_actual_len(ex2);
>> +	if (ext4_es_scan_range(inode, &ext4_es_is_under_io, ex2->ee_block,
>> +	    ex2->ee_block + len - 1))
>> +		return 1;
>> +
>> +	return 0;
>> +}
>> +
>>   int
>>   ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
>>   				struct ext4_extent *ex2)
>> @@ -1744,7 +1771,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
>>   	 */
>>   	if (ext4_ext_is_unwritten(ex1) &&
>>   	    (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
>> -	     atomic_read(&EXT4_I(inode)->i_unwritten) ||
>> +	    ext4_unwritten_extent_under_io(inode, ex1, ex2) ||
>>   	     (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))
> 
> I'd check ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN before
> ext4_unwritten_extent_under_io() as that is a cheaper check. Also we know
> that extents are adjacent so we can just call:
> 
> 	ext4_unwritten_extent_under_io(inode, le32_to_cpu(ex1->ee_block),
> 					ext1_ee_len + ext2_ee_len)
> 
> and save one extent status tree lookup & iteration.
Your comments are good, thanks, and sorry for my bad codes, I should have realized
this inprovement myself.

> 
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index 22a9d8159720..516966197257 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -704,6 +704,16 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
>>   		    ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
>>   				       map->m_lblk + map->m_len - 1))
>>   			status |= EXTENT_STATUS_DELAYED;
>> +		/*
>> +		 * track unwritten extent under io. when io completes, we'll
>                     ^ capital T                      ^ capital W
> 
>> +		 * convert unwritten extent to written, ext4_es_insert_extent()
>> +		 * will be called again to insert this written extent, then
>> +		 * EXTENT_STATUS_IO will be cleared automatically, see remove
>> +		 * logic in ext4_es_insert_extent().
>> +		 */
>> +		if ((status & EXTENT_STATUS_UNWRITTEN) && (flags &
>> +		    EXT4_GET_BLOCKS_IO_SUBMIT))
>> +			status |= EXTENT_STATUS_IO;
>>   		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
>>   					    map->m_pblk, status);
>>   		if (ret < 0) {
> 
> OK, but you fail to clear EXTENT_STATUS_IO if we fail to submit IO for some
> reason or if the IO ends with IO error, don't you? I guess for these error
> cases you can just iterate through all the range covered by ioend and clear
> EXTENT_STATUS_IO bits. We don't care about performance in that case and it
> is the simplest solution I see.
ok, I wrote new patch which will clear this EXTENT_STATUS_IO in mpage_map_and_submit_extent
when there are errors. But for simplicity, I don't write new fucntion to iterate extent
status range, which may need to splilt es into 2 or 3 es, and need to handle memory allocation
failure. I still use ext4_es_insert_extent's feature that removes es firstly and inserts new es
with new status.
After fstests test, I'll send new patch soon, thanks.

Regards,
Xiaougang Wang

> 
> 								Honza
>

Patch

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 240b6dea5441..a93378cd1152 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1713,6 +1713,33 @@  static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
 	return err;
 }
 
+static int ext4_unwritten_extent_under_io(struct inode *inode,
+			struct ext4_extent *ex1, struct ext4_extent *ex2)
+{
+	unsigned short len;
+
+	/*
+	 * The check for IO to unwritten extent is somewhat racy as we
+	 * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after
+	 * dropping i_data_sem. But reserved blocks should save us in that
+	 * case.
+	 */
+	if (atomic_read(&EXT4_I(inode)->i_unwritten) == 0)
+		return 0;
+
+	len = ext4_ext_get_actual_len(ex1);
+	if (ext4_es_scan_range(inode, &ext4_es_is_under_io, ex1->ee_block,
+	    ex1->ee_block + len - 1))
+		return 1;
+
+	len = ext4_ext_get_actual_len(ex2);
+	if (ext4_es_scan_range(inode, &ext4_es_is_under_io, ex2->ee_block,
+	    ex2->ee_block + len - 1))
+		return 1;
+
+	return 0;
+}
+
 int
 ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
 				struct ext4_extent *ex2)
@@ -1744,7 +1771,7 @@  ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
 	 */
 	if (ext4_ext_is_unwritten(ex1) &&
 	    (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
-	     atomic_read(&EXT4_I(inode)->i_unwritten) ||
+	    ext4_unwritten_extent_under_io(inode, ex1, ex2) ||
 	     (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))
 		return 0;
 #ifdef AGGRESSIVE_TEST
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 2b439afafe13..04bbd8b7f8f1 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -1332,6 +1332,15 @@  static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
 		 */
 		if (ext4_es_is_delayed(es))
 			goto next;
+
+		/*
+		 * we don't reclaim unwritten extent under io because we use
+		 * it to check whether we can merge other unwritten extents
+		 * who are not under io, and when io completes, then we can
+		 * reclaim this extent.
+		 */
+		if (ext4_es_is_under_io(es))
+			goto next;
 		if (ext4_es_is_referenced(es)) {
 			ext4_es_clear_referenced(es);
 			goto next;
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 131a8b7df265..29a985600a47 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -36,6 +36,7 @@  enum {
 	ES_DELAYED_B,
 	ES_HOLE_B,
 	ES_REFERENCED_B,
+	ES_IO_B,
 	ES_FLAGS
 };
 
@@ -47,11 +48,13 @@  enum {
 #define EXTENT_STATUS_DELAYED	(1 << ES_DELAYED_B)
 #define EXTENT_STATUS_HOLE	(1 << ES_HOLE_B)
 #define EXTENT_STATUS_REFERENCED	(1 << ES_REFERENCED_B)
+#define EXTENT_STATUS_IO	(1 << ES_IO_B)
 
 #define ES_TYPE_MASK	((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \
 			  EXTENT_STATUS_UNWRITTEN | \
 			  EXTENT_STATUS_DELAYED | \
-			  EXTENT_STATUS_HOLE) << ES_SHIFT)
+			  EXTENT_STATUS_HOLE | \
+			  EXTENT_STATUS_IO) << ES_SHIFT)
 
 struct ext4_sb_info;
 struct ext4_extent;
@@ -173,6 +176,11 @@  static inline int ext4_es_is_delayed(struct extent_status *es)
 	return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0;
 }
 
+static inline int ext4_es_is_under_io(struct extent_status *es)
+{
+	return (ext4_es_type(es) & EXTENT_STATUS_IO) != 0;
+}
+
 static inline int ext4_es_is_hole(struct extent_status *es)
 {
 	return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 22a9d8159720..516966197257 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -704,6 +704,16 @@  int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		    ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
 				       map->m_lblk + map->m_len - 1))
 			status |= EXTENT_STATUS_DELAYED;
+		/*
+		 * track unwritten extent under io. when io completes, we'll
+		 * convert unwritten extent to written, ext4_es_insert_extent()
+		 * will be called again to insert this written extent, then
+		 * EXTENT_STATUS_IO will be cleared automatically, see remove
+		 * logic in ext4_es_insert_extent().
+		 */
+		if ((status & EXTENT_STATUS_UNWRITTEN) && (flags &
+		    EXT4_GET_BLOCKS_IO_SUBMIT))
+			status |= EXTENT_STATUS_IO;
 		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
 					    map->m_pblk, status);
 		if (ret < 0) {