[2/2] ext4: simulate various I/O and checksum errors when reading metadata
diff mbox series

Message ID 20191121183036.29385-2-tytso@mit.edu
State Accepted
Headers show
Series
  • [1/2] ext4: save the error code which triggered an ext4_error() in the superblock
Related show

Commit Message

Theodore Y. Ts'o Nov. 21, 2019, 6:30 p.m. UTC
This allows us to test various error handling code paths

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/balloc.c |  4 +++-
 fs/ext4/ext4.h   | 42 ++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/ialloc.c |  4 +++-
 fs/ext4/inode.c  |  6 +++++-
 fs/ext4/namei.c  | 11 ++++++++---
 fs/ext4/sysfs.c  | 23 +++++++++++++++++++++++
 6 files changed, 84 insertions(+), 6 deletions(-)

Comments

Darrick J. Wong Nov. 22, 2019, 12:09 a.m. UTC | #1
On Thu, Nov 21, 2019 at 01:30:36PM -0500, Theodore Ts'o wrote:
> This allows us to test various error handling code paths
> 
> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
> ---
>  fs/ext4/balloc.c |  4 +++-
>  fs/ext4/ext4.h   | 42 ++++++++++++++++++++++++++++++++++++++++++
>  fs/ext4/ialloc.c |  4 +++-
>  fs/ext4/inode.c  |  6 +++++-
>  fs/ext4/namei.c  | 11 ++++++++---
>  fs/ext4/sysfs.c  | 23 +++++++++++++++++++++++
>  6 files changed, 84 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
> index 102c38527a10..5f993a411251 100644
> --- a/fs/ext4/balloc.c
> +++ b/fs/ext4/balloc.c
> @@ -371,7 +371,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
>  	if (buffer_verified(bh))
>  		goto verified;
>  	if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
> -			desc, bh))) {
> +						    desc, bh) ||
> +		     ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) {
>  		ext4_unlock_group(sb, block_group);
>  		ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
>  		ext4_mark_group_bitmap_corrupted(sb, block_group,
> @@ -505,6 +506,7 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
>  	if (!desc)
>  		return -EFSCORRUPTED;
>  	wait_on_buffer(bh);
> +	ext4_simulate_fail_bh(sb, bh, EXT4_SIM_BBITMAP_EIO);
>  	if (!buffer_uptodate(bh)) {
>  		ext4_set_errno(sb, EIO);
>  		ext4_error(sb, "Cannot read block bitmap - "
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 1c9ac0fc8715..e6798db4634c 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1557,6 +1557,9 @@ struct ext4_sb_info {
>  	/* Barrier between changing inodes' journal flags and writepages ops. */
>  	struct percpu_rw_semaphore s_journal_flag_rwsem;
>  	struct dax_device *s_daxdev;
> +#ifdef CONFIG_EXT4_DEBUG
> +	unsigned long s_simulate_fail;
> +#endif
>  };
>  
>  static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
> @@ -1575,6 +1578,45 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
>  		 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
>  }
>  
> +static inline int ext4_simulate_fail(struct super_block *sb,
> +				     unsigned long flag)

Nit: bool?

> +{
> +#ifdef CONFIG_EXT4_DEBUG
> +	unsigned long old, new;
> +	struct ext4_sb_info *sbi = EXT4_SB(sb);
> +
> +	do {
> +		old = READ_ONCE(sbi->s_simulate_fail);
> +		if (likely((old & flag) == 0))
> +			return 0;
> +		new = old & ~flag;
> +	} while (unlikely(cmpxchg(&sbi->s_simulate_fail, old, new) != old));

If I'm reading this correctly, this means that userspace sets a
s_simulate_fail bit via sysfs knob, and the next time the filesystem
calls ext4_simulate_fail with the same bit set in @flag we'll return
true to say "simulate the failure" and clear the bit in s_simulate_fail?

IOWs, the simulated failures have to be re-armed every time?

Seems reasonable, but consider the possibility that in the future it
might be useful if you could set up periodic failures (e.g. directory
lookups fail 10% of the time) so that you can see how something like
fsstress reacts to less-predictable failures?

Of course that also increases the amount of fugly sysfs boilerplate so
that each knob can have its own sysfs file... that alone is half of a
reason not to do that. :(

--D

> +	return 1;
> +#else
> +	return 0;
> +#endif
> +}
> +
> +static inline void ext4_simulate_fail_bh(struct super_block *sb,
> +					 struct buffer_head *bh,
> +					 unsigned long flag)
> +{
> +	if (!IS_ERR(bh) && ext4_simulate_fail(sb, flag))
> +		clear_buffer_uptodate(bh);
> +}
> +
> +/*
> + * Simulate_fail flags
> + */
> +#define EXT4_SIM_BBITMAP_EIO	0x00000001
> +#define EXT4_SIM_BBITMAP_CRC	0x00000002
> +#define EXT4_SIM_IBITMAP_EIO	0x00000004
> +#define EXT4_SIM_IBITMAP_CRC	0x00000008
> +#define EXT4_SIM_INODE_EIO	0x00000010
> +#define EXT4_SIM_INODE_CRC	0x00000020
> +#define EXT4_SIM_DIRBLOCK_EIO	0x00000040
> +#define EXT4_SIM_DIRBLOCK_CRC	0x00000080
> +
>  /*
>   * Error number codes for s_{first,last}_error_errno
>   *
> diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
> index 31a5fd6f5e6a..f7033594f4a0 100644
> --- a/fs/ext4/ialloc.c
> +++ b/fs/ext4/ialloc.c
> @@ -94,7 +94,8 @@ static int ext4_validate_inode_bitmap(struct super_block *sb,
>  		goto verified;
>  	blk = ext4_inode_bitmap(sb, desc);
>  	if (!ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh,
> -					   EXT4_INODES_PER_GROUP(sb) / 8)) {
> +					   EXT4_INODES_PER_GROUP(sb) / 8) ||
> +	    ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) {
>  		ext4_unlock_group(sb, block_group);
>  		ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
>  			   "inode_bitmap = %llu", block_group, blk);
> @@ -192,6 +193,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
>  	get_bh(bh);
>  	submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
>  	wait_on_buffer(bh);
> +	ext4_simulate_fail_bh(sb, bh, EXT4_SIM_IBITMAP_EIO);
>  	if (!buffer_uptodate(bh)) {
>  		put_bh(bh);
>  		ext4_set_errno(sb, EIO);
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index b67ffa24ae0a..564f88040de9 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4243,6 +4243,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
>  	bh = sb_getblk(sb, block);
>  	if (unlikely(!bh))
>  		return -ENOMEM;
> +	if (ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO))
> +		goto simulate_eio;
>  	if (!buffer_uptodate(bh)) {
>  		lock_buffer(bh);
>  
> @@ -4341,6 +4343,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
>  		blk_finish_plug(&plug);
>  		wait_on_buffer(bh);
>  		if (!buffer_uptodate(bh)) {
> +		simulate_eio:
>  			ext4_set_errno(inode->i_sb, EIO);
>  			EXT4_ERROR_INODE_BLOCK(inode, block,
>  					       "unable to read itable block");
> @@ -4555,7 +4558,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
>  					      sizeof(gen));
>  	}
>  
> -	if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
> +	if (!ext4_inode_csum_verify(inode, raw_inode, ei) ||
> +	    ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) {
>  		ext4_set_errno(inode->i_sb, EFSBADCRC);
>  		ext4_error_inode(inode, function, line, 0,
>  				 "iget: checksum invalid");
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index cee7c28e070d..be14e33e5103 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -109,7 +109,10 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
>  	struct ext4_dir_entry *dirent;
>  	int is_dx_block = 0;
>  
> -	bh = ext4_bread(NULL, inode, block, 0);
> +	if (ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_EIO))
> +		bh = ERR_PTR(-EIO);
> +	else
> +		bh = ext4_bread(NULL, inode, block, 0);
>  	if (IS_ERR(bh)) {
>  		__ext4_warning(inode->i_sb, func, line,
>  			       "inode #%lu: lblock %lu: comm %s: "
> @@ -153,7 +156,8 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
>  	 * caller is sure it should be an index block.
>  	 */
>  	if (is_dx_block && type == INDEX) {
> -		if (ext4_dx_csum_verify(inode, dirent))
> +		if (ext4_dx_csum_verify(inode, dirent) &&
> +		    !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC))
>  			set_buffer_verified(bh);
>  		else {
>  			ext4_set_errno(inode->i_sb, EFSBADCRC);
> @@ -164,7 +168,8 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
>  		}
>  	}
>  	if (!is_dx_block) {
> -		if (ext4_dirblock_csum_verify(inode, bh))
> +		if (ext4_dirblock_csum_verify(inode, bh) &&
> +		    !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC))
>  			set_buffer_verified(bh);
>  		else {
>  			ext4_set_errno(inode->i_sb, EFSBADCRC);
> diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
> index eb1efad0e20a..a990d28d191b 100644
> --- a/fs/ext4/sysfs.c
> +++ b/fs/ext4/sysfs.c
> @@ -29,6 +29,7 @@ typedef enum {
>  	attr_last_error_time,
>  	attr_feature,
>  	attr_pointer_ui,
> +	attr_pointer_ul,
>  	attr_pointer_atomic,
>  	attr_journal_task,
>  } attr_id_t;
> @@ -160,6 +161,9 @@ static struct ext4_attr ext4_attr_##_name = {			\
>  #define EXT4_RW_ATTR_SBI_UI(_name,_elname)	\
>  	EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname)
>  
> +#define EXT4_RW_ATTR_SBI_UL(_name,_elname)	\
> +	EXT4_ATTR_OFFSET(_name, 0644, pointer_ul, ext4_sb_info, _elname)
> +
>  #define EXT4_ATTR_PTR(_name,_mode,_id,_ptr) \
>  static struct ext4_attr ext4_attr_##_name = {			\
>  	.attr = {.name = __stringify(_name), .mode = _mode },	\
> @@ -194,6 +198,9 @@ EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.int
>  EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
>  EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
>  EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
> +#ifdef CONFIG_EXT4_DEBUG
> +EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail);
> +#endif
>  EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
>  EXT4_ATTR(first_error_time, 0444, first_error_time);
>  EXT4_ATTR(last_error_time, 0444, last_error_time);
> @@ -228,6 +235,9 @@ static struct attribute *ext4_attrs[] = {
>  	ATTR_LIST(first_error_time),
>  	ATTR_LIST(last_error_time),
>  	ATTR_LIST(journal_task),
> +#ifdef CONFIG_EXT4_DEBUG
> +	ATTR_LIST(simulate_fail),
> +#endif
>  	NULL,
>  };
>  ATTRIBUTE_GROUPS(ext4);
> @@ -318,6 +328,11 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
>  		else
>  			return snprintf(buf, PAGE_SIZE, "%u\n",
>  					*((unsigned int *) ptr));
> +	case attr_pointer_ul:
> +		if (!ptr)
> +			return 0;
> +		return snprintf(buf, PAGE_SIZE, "%lu\n",
> +				*((unsigned long *) ptr));
>  	case attr_pointer_atomic:
>  		if (!ptr)
>  			return 0;
> @@ -361,6 +376,14 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
>  		else
>  			*((unsigned int *) ptr) = t;
>  		return len;
> +	case attr_pointer_ul:
> +		if (!ptr)
> +			return 0;
> +		ret = kstrtoul(skip_spaces(buf), 0, &t);
> +		if (ret)
> +			return ret;
> +		*((unsigned long *) ptr) = t;
> +		return len;
>  	case attr_inode_readahead:
>  		return inode_readahead_blks_store(sbi, buf, len);
>  	case attr_trigger_test_error:
> -- 
> 2.23.0
>
Theodore Y. Ts'o Nov. 22, 2019, 1 a.m. UTC | #2
On Thu, Nov 21, 2019 at 04:09:33PM -0800, Darrick J. Wong wrote:
> > +static inline int ext4_simulate_fail(struct super_block *sb,
> > +				     unsigned long flag)
> 
> Nit: bool?

Sure, I'll do this for the next version.

> If I'm reading this correctly, this means that userspace sets a
> s_simulate_fail bit via sysfs knob, and the next time the filesystem
> calls ext4_simulate_fail with the same bit set in @flag we'll return
> true to say "simulate the failure" and clear the bit in s_simulate_fail?
> 
> IOWs, the simulated failures have to be re-armed every time?

Yes, that's correct.

> Seems reasonable, but consider the possibility that in the future it
> might be useful if you could set up periodic failures (e.g. directory
> lookups fail 10% of the time) so that you can see how something like
> fsstress reacts to less-predictable failures?

So in theory, we could do that with dm_flakey --- but that's a pain in
the tuckus, since you have to specify the LBA for the directory blocks
that you might want to have fail.  I implemented this so I could have
a quick and dirty way of testing the first patch in this series (and
in fact, I found a bug in the first version of the previous patch, so
I'm glad I spent the time to implement the test patch :-).

What might be interesting to do is some kind of eBPF hook where we
pass in the block #, inode #, and metadata type, and the ePBF program
could do use a much more complex set of criteria in terms of whether
or not to trigger an EIO, or how to fuzz a particular block to either
force a CRC failure, or to try to find bugs ala Hydra[1] (funded via a
Google Faculty Research Award grant), but using a much more glass-box
style test approach.

[1] https://gts3.org/~sanidhya/pubs/2019/hydra.pdf

This would be a lot more work, and I'm not sufficiently up to speed
with eBPF, and I just needed a quick and dirty testing scheme.

The reason why I think it's worthwhile to land this patch (as opposed
to throwing it away after doing the development work for the previous
patch) is that it's a relatively small set of changes, and all of the
code disappears if CONFIG_DEBUG_EXT4 is not enabled.  So it has no
performance cost on production kernels, and it's highly unlikely that
users would have a reason to use this feature on production use cases,
so ripping this out if and when we have a more functional eBPF testing
infrastructure to replace it shouldn't really be a problem.

					- Ted

P.S.  A fascinating question is whether we could make the hooks for
this hypothetical eBPF hook general enough that it could work for more
than just ext4, but for other file systems.  The problem is that the
fs metadata types are not going to be same across different file
systems, so that makes the API design quite tricky; and perhaps not
worth it?
Darrick J. Wong Nov. 22, 2019, 1:18 a.m. UTC | #3
On Thu, Nov 21, 2019 at 08:00:26PM -0500, Theodore Y. Ts'o wrote:
> On Thu, Nov 21, 2019 at 04:09:33PM -0800, Darrick J. Wong wrote:
> > > +static inline int ext4_simulate_fail(struct super_block *sb,
> > > +				     unsigned long flag)
> > 
> > Nit: bool?
> 
> Sure, I'll do this for the next version.
> 
> > If I'm reading this correctly, this means that userspace sets a
> > s_simulate_fail bit via sysfs knob, and the next time the filesystem
> > calls ext4_simulate_fail with the same bit set in @flag we'll return
> > true to say "simulate the failure" and clear the bit in s_simulate_fail?
> > 
> > IOWs, the simulated failures have to be re-armed every time?
> 
> Yes, that's correct.
> 
> > Seems reasonable, but consider the possibility that in the future it
> > might be useful if you could set up periodic failures (e.g. directory
> > lookups fail 10% of the time) so that you can see how something like
> > fsstress reacts to less-predictable failures?
> 
> So in theory, we could do that with dm_flakey --- but that's a pain in
> the tuckus, since you have to specify the LBA for the directory blocks
> that you might want to have fail.

Funny, I've been working on a fstests helper function to make it easy to
set up dm-flakey based on fiemap/getfsmap output and such. :)

> I implemented this so I could have
> a quick and dirty way of testing the first patch in this series (and
> in fact, I found a bug in the first version of the previous patch, so
> I'm glad I spent the time to implement the test patch :-).

Heh, cool!

> What might be interesting to do is some kind of eBPF hook where we
> pass in the block #, inode #, and metadata type, and the ePBF program
> could do use a much more complex set of criteria in terms of whether
> or not to trigger an EIO, or how to fuzz a particular block to either
> force a CRC failure, or to try to find bugs ala Hydra[1] (funded via a
> Google Faculty Research Award grant), but using a much more glass-box
> style test approach.

That would be fun.  Attach an arbitrary eBPF program to a range of
sectors.  I wonder how loud the howls of protest would be for "can we
let ebpf programs scribble on a kernel io buffer pleeze?"...

...a couple of years ago I sent out an RFCRAP patch so that you could
use eBPF's "new" ability to change function return values, which
Christoph immediately NAKd.  I think Josef's original purpose was so
that he could inject arbitrary debugging knobs all over btrfs.

> [1] https://gts3.org/~sanidhya/pubs/2019/hydra.pdf
> 
> This would be a lot more work, and I'm not sufficiently up to speed
> with eBPF, and I just needed a quick and dirty testing scheme.
> 
> The reason why I think it's worthwhile to land this patch (as opposed
> to throwing it away after doing the development work for the previous
> patch) is that it's a relatively small set of changes, and all of the
> code disappears if CONFIG_DEBUG_EXT4 is not enabled.  So it has no
> performance cost on production kernels, and it's highly unlikely that
> users would have a reason to use this feature on production use cases,
> so ripping this out if and when we have a more functional eBPF testing
> infrastructure to replace it shouldn't really be a problem.

Admittedly it's a debug knob so I don't see it as a big deal if you
merge this and some day rip it out or supersede it.  The XFS knobs have
undergone a few, uh, interface revisions.

> 					- Ted
> 
> P.S.  A fascinating question is whether we could make the hooks for
> this hypothetical eBPF hook general enough that it could work for more
> than just ext4, but for other file systems.  The problem is that the
> fs metadata types are not going to be same across different file
> systems, so that makes the API design quite tricky; and perhaps not
> worth it?

Yeah.  I mean, it's eBPF glomming onto random parts of the kernel, so I
don't think there's ever going to be a General API For Brain Slugs[3].

OTOH I need LSF topics so sure lets roll.

--D

[2] https://lore.kernel.org/linux-xfs/20171213061825.GO19219@magnolia/
[3] https://futurama.fandom.com/wiki/Brain_Slug
Theodore Y. Ts'o Nov. 22, 2019, 3:56 a.m. UTC | #4
On Thu, Nov 21, 2019 at 05:18:34PM -0800, Darrick J. Wong wrote:
> > So in theory, we could do that with dm_flakey --- but that's a pain in
> > the tuckus, since you have to specify the LBA for the directory blocks
> > that you might want to have fail.
> 
> Funny, I've been working on a fstests helper function to make it easy to
> set up dm-flakey based on fiemap/getfsmap output and such. :)

Ah, excellent, I'm looking forward to seeing it.  :-)

> > What might be interesting to do is some kind of eBPF hook where we
> > pass in the block #, inode #, and metadata type, and the ePBF program
> > could do use a much more complex set of criteria in terms of whether
> > or not to trigger an EIO, or how to fuzz a particular block to either
> > force a CRC failure, or to try to find bugs ala Hydra[1] (funded via a
> > Google Faculty Research Award grant), but using a much more glass-box
> > style test approach.
> 
> That would be fun.  Attach an arbitrary eBPF program to a range of
> sectors.  I wonder how loud the howls of protest would be for "can we
> let ebpf programs scribble on a kernel io buffer pleeze?"...

Well, because the eBPF hook would include the metadata type (an
allocation bitmap vs inode table vs htree index vs htree leaf block,
etc.) what I was thinking about has to be done in ext4 as a
ext4-specific hook.  And it would only be enabled if CONFIG_EXT4_DEBUG
or a separate Kconfig option is enabled --- and I *hope* no
distribution would be so silly/stupid enough to enable it on a product
kernel build.  :-)

						- Ted
Andreas Dilger Nov. 22, 2019, 11:46 p.m. UTC | #5
On Nov 21, 2019, at 5:09 PM, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> 
> On Thu, Nov 21, 2019 at 01:30:36PM -0500, Theodore Ts'o wrote:
>> This allows us to test various error handling code paths
>> 
>> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
>> ---
>> fs/ext4/balloc.c |  4 +++-
>> fs/ext4/ext4.h   | 42 ++++++++++++++++++++++++++++++++++++++++++
>> fs/ext4/ialloc.c |  4 +++-
>> fs/ext4/inode.c  |  6 +++++-
>> fs/ext4/namei.c  | 11 ++++++++---
>> fs/ext4/sysfs.c  | 23 +++++++++++++++++++++++
>> 6 files changed, 84 insertions(+), 6 deletions(-)
>> 
>> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
>> index 102c38527a10..5f993a411251 100644
>> --- a/fs/ext4/balloc.c
>> +++ b/fs/ext4/balloc.c
>> @@ -371,7 +371,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
>> 	if (buffer_verified(bh))
>> 		goto verified;
>> 	if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
>> -			desc, bh))) {
>> +						    desc, bh) ||
>> +		     ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) {
>> 		ext4_unlock_group(sb, block_group);
>> 		ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
>> 		ext4_mark_group_bitmap_corrupted(sb, block_group,
>> @@ -505,6 +506,7 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
>> 	if (!desc)
>> 		return -EFSCORRUPTED;
>> 	wait_on_buffer(bh);
>> +	ext4_simulate_fail_bh(sb, bh, EXT4_SIM_BBITMAP_EIO);
>> 	if (!buffer_uptodate(bh)) {
>> 		ext4_set_errno(sb, EIO);
>> 		ext4_error(sb, "Cannot read block bitmap - "
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index 1c9ac0fc8715..e6798db4634c 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -1557,6 +1557,9 @@ struct ext4_sb_info {
>> 	/* Barrier between changing inodes' journal flags and writepages ops. */
>> 	struct percpu_rw_semaphore s_journal_flag_rwsem;
>> 	struct dax_device *s_daxdev;
>> +#ifdef CONFIG_EXT4_DEBUG
>> +	unsigned long s_simulate_fail;
>> +#endif
>> };
>> 
>> static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
>> @@ -1575,6 +1578,45 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
>> 		 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
>> }
>> 
>> +static inline int ext4_simulate_fail(struct super_block *sb,
>> +				     unsigned long flag)
> 
> Nit: bool?
> 
>> +{
>> +#ifdef CONFIG_EXT4_DEBUG
>> +	unsigned long old, new;
>> +	struct ext4_sb_info *sbi = EXT4_SB(sb);
>> +
>> +	do {
>> +		old = READ_ONCE(sbi->s_simulate_fail);
>> +		if (likely((old & flag) == 0))
>> +			return 0;
>> +		new = old & ~flag;
>> +	} while (unlikely(cmpxchg(&sbi->s_simulate_fail, old, new) != old));
> 
> If I'm reading this correctly, this means that userspace sets a
> s_simulate_fail bit via sysfs knob, and the next time the filesystem
> calls ext4_simulate_fail with the same bit set in @flag we'll return
> true to say "simulate the failure" and clear the bit in s_simulate_fail?
> 
> IOWs, the simulated failures have to be re-armed every time?
> 
> Seems reasonable, but consider the possibility that in the future it
> might be useful if you could set up periodic failures (e.g. directory
> lookups fail 10% of the time) so that you can see how something like
> fsstress reacts to less-predictable failures?
> 
> Of course that also increases the amount of fugly sysfs boilerplate so
> that each knob can have its own sysfs file... that alone is half of a
> reason not to do that. :(

Just for comparison, Lustre has had a fault injection mechanism for ages
that can do a bunch of things like this.  Each fault location has a unique
number (we separate them by subsystems in the code, but numbers are rather
arbitrary), and then a sysfs parameter "fail_loc" that can be set to match
the fault location to inject errors, and "fail_val" that allows userspace
to adjust/tune the failure behavior (e.g. only affect target N, or sleep N
seconds, ...).

The low 16 bits set in fail_loc is the fault location number, and the high
16 bits of fail_loc are flags that can modify the behavior independent of
which failure number is being used:
- CFS_FAIL_ONCE: the fail_loc should only fail once (default forever)
- CFS_FAIL_SKIP: skip the fail_loc the first "fail_val" times
- CFS_FAIL_SOME: trigger the failure the first "fail_val" times
- CFS_FAIL_RAND: trigger the failure at a rate of 1/fail_val

There are also flags set by the kernel when the failure is hit, so it is
possible to read fail_loc in a test script if the failure was already hit.

Internally in the code, the most common use is just checking if we hit the
currently-set fail_loc (which is unlikely() for minimal impact), like:

	if (CFS_FAIL_CHECK(OBD_FAIL_TGT_REPLAY_RECONNECT))
		 RETURN(1);	 /* don't send early reply */

        if (CFS_FAIL_CHECK(OBD_FAIL_FLD_QUERY_REQ) && req->rq_no_delay) {
                /* the same error returned by ptlrpc_import_delay_req() */
                rc = -EWOULDBLOCK;
                req->rq_status = rc;

It is possible to inject a delay into a thread to allow something else to
happen (maybe more useful for a distributed system rather than a local one):

	CFS_FAIL_TIMEOUT(OBD_FAIL_TGT_REPLAY_DELAY2, cfs_fail_val);

It is also possible to set up a race between two threads in the same or
different parts of the code on the same node:

	CFS_RACE(CFS_FAIL_CHLOG_USER_REG_UNREG_RACE);

The first thread to hit this fail_loc will sleep, and the second thread
that hits it will wake it up.  There is a variation of this to make it
explicit that only one thread to hit a location should sleep, and a
second thread needs to hit a different location to wake it up:

	thread1:
	CFS_RACE_WAIT(OBD_FAIL_OBD_ZERO_NLINK_RACE);

				thread2:
				CFS_RACE_WAKEUP(OBD_FAIL_OBD_ZERO_NLINK_RACE);

It is also possible to daisy-chain failure conditions:

	if (ns_is_client(ldlm_lock_to_ns(lock)) &&
	    CFS_FAIL_CHECK_RESET(OBD_FAIL_LDLM_INTR_CP_AST,
				 OBD_FAIL_LDLM_CP_BL_RACE | OBD_FAIL_ONCE))
		ldlm_set_fail_loc(lock);

so here if "OBD_FAIL_LDLM_INTR_CP_AST" is hit, it will reset fail_loc to be "OBD_FAIL_LDLM_CP_BL_RACE" to fail once, flag a particular DLM lock and then
the next two threads that access this lock will race to process it:

	if (ldlm_is_fail_loc(lock))
		CFS_RACE(OBD_FAIL_LDLM_CP_BL_RACE);

The CFS_FAIL functionality has been used for quite a few years and has
proven sufficient and easily used to invoke failure conditions that would
be otherwise impossible to reproduce (over a thousand fault injection
sites in the Lustre code and corresponding tests to trigger them today).

Cheers, Andreas

Patch
diff mbox series

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 102c38527a10..5f993a411251 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -371,7 +371,8 @@  static int ext4_validate_block_bitmap(struct super_block *sb,
 	if (buffer_verified(bh))
 		goto verified;
 	if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
-			desc, bh))) {
+						    desc, bh) ||
+		     ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) {
 		ext4_unlock_group(sb, block_group);
 		ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
 		ext4_mark_group_bitmap_corrupted(sb, block_group,
@@ -505,6 +506,7 @@  int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
 	if (!desc)
 		return -EFSCORRUPTED;
 	wait_on_buffer(bh);
+	ext4_simulate_fail_bh(sb, bh, EXT4_SIM_BBITMAP_EIO);
 	if (!buffer_uptodate(bh)) {
 		ext4_set_errno(sb, EIO);
 		ext4_error(sb, "Cannot read block bitmap - "
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1c9ac0fc8715..e6798db4634c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1557,6 +1557,9 @@  struct ext4_sb_info {
 	/* Barrier between changing inodes' journal flags and writepages ops. */
 	struct percpu_rw_semaphore s_journal_flag_rwsem;
 	struct dax_device *s_daxdev;
+#ifdef CONFIG_EXT4_DEBUG
+	unsigned long s_simulate_fail;
+#endif
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1575,6 +1578,45 @@  static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 		 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
 }
 
+static inline int ext4_simulate_fail(struct super_block *sb,
+				     unsigned long flag)
+{
+#ifdef CONFIG_EXT4_DEBUG
+	unsigned long old, new;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	do {
+		old = READ_ONCE(sbi->s_simulate_fail);
+		if (likely((old & flag) == 0))
+			return 0;
+		new = old & ~flag;
+	} while (unlikely(cmpxchg(&sbi->s_simulate_fail, old, new) != old));
+	return 1;
+#else
+	return 0;
+#endif
+}
+
+static inline void ext4_simulate_fail_bh(struct super_block *sb,
+					 struct buffer_head *bh,
+					 unsigned long flag)
+{
+	if (!IS_ERR(bh) && ext4_simulate_fail(sb, flag))
+		clear_buffer_uptodate(bh);
+}
+
+/*
+ * Simulate_fail flags
+ */
+#define EXT4_SIM_BBITMAP_EIO	0x00000001
+#define EXT4_SIM_BBITMAP_CRC	0x00000002
+#define EXT4_SIM_IBITMAP_EIO	0x00000004
+#define EXT4_SIM_IBITMAP_CRC	0x00000008
+#define EXT4_SIM_INODE_EIO	0x00000010
+#define EXT4_SIM_INODE_CRC	0x00000020
+#define EXT4_SIM_DIRBLOCK_EIO	0x00000040
+#define EXT4_SIM_DIRBLOCK_CRC	0x00000080
+
 /*
  * Error number codes for s_{first,last}_error_errno
  *
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 31a5fd6f5e6a..f7033594f4a0 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -94,7 +94,8 @@  static int ext4_validate_inode_bitmap(struct super_block *sb,
 		goto verified;
 	blk = ext4_inode_bitmap(sb, desc);
 	if (!ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh,
-					   EXT4_INODES_PER_GROUP(sb) / 8)) {
+					   EXT4_INODES_PER_GROUP(sb) / 8) ||
+	    ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) {
 		ext4_unlock_group(sb, block_group);
 		ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
 			   "inode_bitmap = %llu", block_group, blk);
@@ -192,6 +193,7 @@  ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 	get_bh(bh);
 	submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
 	wait_on_buffer(bh);
+	ext4_simulate_fail_bh(sb, bh, EXT4_SIM_IBITMAP_EIO);
 	if (!buffer_uptodate(bh)) {
 		put_bh(bh);
 		ext4_set_errno(sb, EIO);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b67ffa24ae0a..564f88040de9 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4243,6 +4243,8 @@  static int __ext4_get_inode_loc(struct inode *inode,
 	bh = sb_getblk(sb, block);
 	if (unlikely(!bh))
 		return -ENOMEM;
+	if (ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO))
+		goto simulate_eio;
 	if (!buffer_uptodate(bh)) {
 		lock_buffer(bh);
 
@@ -4341,6 +4343,7 @@  static int __ext4_get_inode_loc(struct inode *inode,
 		blk_finish_plug(&plug);
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
+		simulate_eio:
 			ext4_set_errno(inode->i_sb, EIO);
 			EXT4_ERROR_INODE_BLOCK(inode, block,
 					       "unable to read itable block");
@@ -4555,7 +4558,8 @@  struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
 					      sizeof(gen));
 	}
 
-	if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
+	if (!ext4_inode_csum_verify(inode, raw_inode, ei) ||
+	    ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) {
 		ext4_set_errno(inode->i_sb, EFSBADCRC);
 		ext4_error_inode(inode, function, line, 0,
 				 "iget: checksum invalid");
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index cee7c28e070d..be14e33e5103 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -109,7 +109,10 @@  static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
 	struct ext4_dir_entry *dirent;
 	int is_dx_block = 0;
 
-	bh = ext4_bread(NULL, inode, block, 0);
+	if (ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_EIO))
+		bh = ERR_PTR(-EIO);
+	else
+		bh = ext4_bread(NULL, inode, block, 0);
 	if (IS_ERR(bh)) {
 		__ext4_warning(inode->i_sb, func, line,
 			       "inode #%lu: lblock %lu: comm %s: "
@@ -153,7 +156,8 @@  static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
 	 * caller is sure it should be an index block.
 	 */
 	if (is_dx_block && type == INDEX) {
-		if (ext4_dx_csum_verify(inode, dirent))
+		if (ext4_dx_csum_verify(inode, dirent) &&
+		    !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC))
 			set_buffer_verified(bh);
 		else {
 			ext4_set_errno(inode->i_sb, EFSBADCRC);
@@ -164,7 +168,8 @@  static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
 		}
 	}
 	if (!is_dx_block) {
-		if (ext4_dirblock_csum_verify(inode, bh))
+		if (ext4_dirblock_csum_verify(inode, bh) &&
+		    !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC))
 			set_buffer_verified(bh);
 		else {
 			ext4_set_errno(inode->i_sb, EFSBADCRC);
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index eb1efad0e20a..a990d28d191b 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -29,6 +29,7 @@  typedef enum {
 	attr_last_error_time,
 	attr_feature,
 	attr_pointer_ui,
+	attr_pointer_ul,
 	attr_pointer_atomic,
 	attr_journal_task,
 } attr_id_t;
@@ -160,6 +161,9 @@  static struct ext4_attr ext4_attr_##_name = {			\
 #define EXT4_RW_ATTR_SBI_UI(_name,_elname)	\
 	EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname)
 
+#define EXT4_RW_ATTR_SBI_UL(_name,_elname)	\
+	EXT4_ATTR_OFFSET(_name, 0644, pointer_ul, ext4_sb_info, _elname)
+
 #define EXT4_ATTR_PTR(_name,_mode,_id,_ptr) \
 static struct ext4_attr ext4_attr_##_name = {			\
 	.attr = {.name = __stringify(_name), .mode = _mode },	\
@@ -194,6 +198,9 @@  EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.int
 EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
 EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
 EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
+#ifdef CONFIG_EXT4_DEBUG
+EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail);
+#endif
 EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
 EXT4_ATTR(first_error_time, 0444, first_error_time);
 EXT4_ATTR(last_error_time, 0444, last_error_time);
@@ -228,6 +235,9 @@  static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(first_error_time),
 	ATTR_LIST(last_error_time),
 	ATTR_LIST(journal_task),
+#ifdef CONFIG_EXT4_DEBUG
+	ATTR_LIST(simulate_fail),
+#endif
 	NULL,
 };
 ATTRIBUTE_GROUPS(ext4);
@@ -318,6 +328,11 @@  static ssize_t ext4_attr_show(struct kobject *kobj,
 		else
 			return snprintf(buf, PAGE_SIZE, "%u\n",
 					*((unsigned int *) ptr));
+	case attr_pointer_ul:
+		if (!ptr)
+			return 0;
+		return snprintf(buf, PAGE_SIZE, "%lu\n",
+				*((unsigned long *) ptr));
 	case attr_pointer_atomic:
 		if (!ptr)
 			return 0;
@@ -361,6 +376,14 @@  static ssize_t ext4_attr_store(struct kobject *kobj,
 		else
 			*((unsigned int *) ptr) = t;
 		return len;
+	case attr_pointer_ul:
+		if (!ptr)
+			return 0;
+		ret = kstrtoul(skip_spaces(buf), 0, &t);
+		if (ret)
+			return ret;
+		*((unsigned long *) ptr) = t;
+		return len;
 	case attr_inode_readahead:
 		return inode_readahead_blks_store(sbi, buf, len);
 	case attr_trigger_test_error: