
ext4: Set file system to read-only by I/O error threshold

Message ID 1308312516-14252-1-git-send-email-wangshaoyan.pt@taobao.com
State New, archived

Commit Message

stufever@gmail.com June 17, 2011, 12:08 p.m. UTC
From: Wang Shaoyan <wangshaoyan.pt@taobao.com>

Some versions of Hadoop use access(2) to check whether the data-chunk hard disk is online; if access(2) returns an error, Hadoop marks the disk on which it called access(2) as offline. This method works for Ext3/4 with a journal, because when jbd/jbd2 encounters an I/O error the file system is set read-only. In Ext4 no-journal mode there is no jbd2 to set the file system read-only when an I/O error happens, so the access(2) call from Hadoop cannot reliably detect the hard-disk-offline condition.

This patch tries to fix the above problem from the kernel side. People can set an I/O error threshold; under two conditions an Ext4 file system without a journal will be set read-only:
1) within one sampling interval, more I/O errors occur than the pre-set threshold
2) I/O errors keep occurring across consecutive sampling intervals and their sum exceeds the pre-set threshold

The application can then detect that the file system has been set read-only and invoke its own failure-tolerance procedures.

Two interfaces are exported to user space via sysfs:
/sys/fs/ext4/sd[?]/eio_threshold --- I/O error threshold for setting the file system read-only
/sys/fs/ext4/sd[?]/eio_interval  --- sampling interval in seconds

If 0 is echoed into eio_threshold, the I/O error threshold becomes infinite and the file system will never be set read-only by this mechanism.

Cc: Ted Tso <tytso@mit.edu>
Cc: Jan Kara <jack@suse.cz>
Reviewed-by: Coly Li <bosong.ly@taobao.com>
Reviewed-by: Liu Yuan <tailai.ly@taobao.com>
Signed-off-by: Wang Shaoyan <wangshaoyan.pt@taobao.com>

---
 fs/ext4/ext4.h  |    5 +++++
 fs/ext4/super.c |   51 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+), 0 deletions(-)

Comments

Lukas Czerner June 18, 2011, 8:38 a.m. UTC | #1
On Fri, 17 Jun 2011, stufever@gmail.com wrote:

> From: Wang Shaoyan <wangshaoyan.pt@taobao.com>
> 
> Some version of Hadoop uses access(2) to check whether the data chunk harddisk is online, if access(2) returns error, hadoop marks the disk which it called access(2) as offline. This method works for Ext3/4 with journal, because when jbd/jbd2 encounters I/O error, the file system will be set as read-only. For Ext4 no-journal mode, there is no jdb2 to set the file system as read-only when I/O error happens, the access(2) from Hadoop is not able to reliably detect hard disk offline condition.

Hi,

so you're saying that you get an I/O error from access(2) only with
Ext3/4 with a journal. Given that you're checking the error count in
ext4_handle_error(), which is called when an I/O error happens, I fail to
see how this helps your case. Am I missing something?

Also, I do not understand how this is helpful at all. Usually when we
hit an I/O error we want the predictable behavior set by the errors=
mount option, but with this patch we have absolutely unpredictable
behaviour on errors, which is bad! We can also end up with a read-only
file system even when errors=continue has been set.

Could you please provide a real use case for having an error threshold?
To me it does not seem like a very good idea.

A couple of comments below.

> 
> This patch tries to fix the above problem from kernel side. People can set I/O error threshold, in 2 conditions Ext4 file system without journal will be set as read-only:
> 1) inside the sampling interval, I/O errors come more then pre-set threshold happens
> 2) I/O errors always happen in continous sampling intervals, the sum of errors exceeds pre-set threshold
> 
> Then the application can find the file system is set as read-only, and call its own failure tolerance procedures.
> 
> There are 2 interface exported to user space via sysfs:
> /sys/fs/ext4/sd[?]/eio_threshold --- I/O error threshold to set file system as read-only
> /sys/fs/ext4/sd[?]/eio_interval  --- sampling interval in second
> 
> If echo 0 into file eio_threshold, I/O error threshold will be infinite, no file system read-only will be triggered.
> 
> Cc: Ted Tso <tytso@mit.edu>
> Cc: Jan Kara <jack@suse.cz>
> Reviewed-by: Coly Li <bosong.ly@taobao.com>
> Reviewed-by: Liu Yuan <tailai.ly@taobao.com>
> Signed-off-by: Wang Shaoyan <wangshaoyan.pt@taobao.com>
> 
> ---
>  fs/ext4/ext4.h  |    5 +++++
>  fs/ext4/super.c |   51 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 56 insertions(+), 0 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 1921392..8f445a8 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1108,6 +1108,11 @@ struct ext4_sb_info {
>  	int s_first_ino;
>  	unsigned int s_inode_readahead_blks;
>  	unsigned int s_inode_goal;
> +	spinlock_t s_eio_lock;

You can use atomic_t and get rid of the spinlock, maybe?

> +	unsigned int s_eio_threshold;
> +	unsigned int s_eio_interval;
> +	unsigned int s_eio_counter;
> +	unsigned long s_eio_last_jiffies;
>  	spinlock_t s_next_gen_lock;
>  	u32 s_next_generation;
>  	u32 s_hash_seed[4];
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index cc5c157..f85ddcd 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -384,6 +384,23 @@ static void save_error_info(struct super_block *sb, const char *func,
>  	ext4_commit_super(sb, 1);
>  }
>  
> +static inline void check_error_number(struct super_block *sb)

The name for this function should rather be inc_sb_error_count().

> +{
> +	struct ext4_sb_info *sbi = EXT4_SB(sb);
> +
> +	if (time_after(sbi->s_eio_last_jiffies + sbi->s_eio_interval * HZ, jiffies)) {
> +		sbi->s_eio_counter++;
> +	} else {
> +		sbi->s_eio_counter = 1;
> +	}
> +
> +	sbi->s_eio_last_jiffies = jiffies;
> +	ext4_msg(sb, KERN_CRIT, "count total: %d", sbi->s_eio_counter);
> +	
> +	if (sbi->s_eio_counter > sbi->s_eio_threshold) { 

I am not sure, but given that it is a "threshold", should we not trigger
it when we hit the threshold rather than at threshold+1?

> +		ext4_abort(sb, "Two many io error, abort it");

Could you use a better error message? This one says nothing about why
it happened. Something like "I/O error count reached the threshold"?

> +	}
> +}
>  
>  /* Deal with the reporting of failure conditions on a filesystem such as
>   * inconsistencies detected or read IO failures.
> @@ -402,9 +419,17 @@ static void save_error_info(struct super_block *sb, const char *func,
>  
>  static void ext4_handle_error(struct super_block *sb)
>  {
> +	struct ext4_sb_info *sbi = EXT4_SB(sb);
> +
>  	if (sb->s_flags & MS_RDONLY)
>  		return;
>  
> +	if (sbi->s_eio_threshold && !sbi->s_journal) {
> +		spin_lock(&sbi->s_eio_lock);
> +		check_error_number(sb);
> +		spin_unlock(&sbi->s_eio_lock);

Maybe you can use atomic operations and get rid of the spin_lock.

> +	}
> +
>  	if (!test_opt(sb, ERRORS_CONT)) {
>  		journal_t *journal = EXT4_SB(sb)->s_journal;
>  
> @@ -2471,6 +2496,22 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
>  	return count;
>  }
>  
> +static ssize_t eio_interval_store(struct ext4_attr *a,
> +					  struct ext4_sb_info *sbi,
> +					  const char *buf, size_t count)
> +{
> +	unsigned long t;
> +
> +	if (parse_strtoul(buf, 0xffffffff, &t))
> +		return -EINVAL;
> +
> +	if (t <= 0)
> +		return -EINVAL;
> +
> +	sbi->s_eio_interval = t;
> +	return count;
> +}
> +
>  static ssize_t sbi_ui_show(struct ext4_attr *a,
>  			   struct ext4_sb_info *sbi, char *buf)
>  {
> @@ -2524,6 +2565,9 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
>  EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
>  EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
>  EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
> +EXT4_RW_ATTR_SBI_UI(eio_threshold, s_eio_threshold);
> +EXT4_ATTR_OFFSET(eio_interval, 0644, sbi_ui_show,
> +		 eio_interval_store, s_eio_interval);
>  
>  static struct attribute *ext4_attrs[] = {
>  	ATTR_LIST(delayed_allocation_blocks),
> @@ -2540,6 +2584,8 @@ static struct attribute *ext4_attrs[] = {
>  	ATTR_LIST(mb_stream_req),
>  	ATTR_LIST(mb_group_prealloc),
>  	ATTR_LIST(max_writeback_mb_bump),
> +	ATTR_LIST(eio_threshold),
> +	ATTR_LIST(eio_interval),
>  	NULL,
>  };
>  
> @@ -3464,6 +3510,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>  	sbi->s_stripe = ext4_get_stripe_size(sbi);
>  	sbi->s_max_writeback_mb_bump = 128;
>  
> +	spin_lock_init(&sbi->s_eio_lock);
> +	sbi->s_eio_threshold = 10;
> +	sbi->s_eio_interval = 5;
> +	sbi->s_eio_counter = 0;
> +
>  	/*
>  	 * set up enough so that it can read an inode
>  	 */
>
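
A minimal sketch of the atomic_t variant Lukas suggests above, assuming
s_eio_counter becomes an atomic_t in struct ext4_sb_info and that this
lives in the fs/ext4/super.c context of the patch; it is untested, and the
helper name inc_sb_error_count() comes from the review, not from the
posted patch:

static void inc_sb_error_count(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	/* Outside the sampling window: restart the count from zero. */
	if (!time_after(sbi->s_eio_last_jiffies +
			sbi->s_eio_interval * HZ, jiffies))
		atomic_set(&sbi->s_eio_counter, 0);

	sbi->s_eio_last_jiffies = jiffies;

	/* atomic_inc_return() takes over from the spinlock-protected
	 * counter and also fixes the off-by-one noted above. */
	if (atomic_inc_return(&sbi->s_eio_counter) >= sbi->s_eio_threshold)
		ext4_abort(sb, "I/O error count reached the threshold");
}
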
stufever@gmail.com June 20, 2011, 1:32 a.m. UTC | #2
2011/6/18 Lukas Czerner <lczerner@redhat.com>:

> Hi,
>
> so you're saying that you encounter I/O error on access(2) only with
> Ext3/4 with journal. So given that you're checking the error count in
> ext4_handle_error() which is called when I/O error happens I fail to see
> how this helps your case. Am I missing something ?
>
Only when access(2) returns "Read-only file system" will Hadoop mark
the disk as offline. In Ext4 no-journal mode there is no jbd2 to set
the file system read-only when an I/O error happens, so we set a
threshold: when the I/O error count reaches that number, we change the
filesystem to read-only. I use ext4_abort(), maybe that is wrong?
> Also I do not understand how this is helpful at all ? Usually when we
> hit I/O error we want to have predictable behavior set by the error=
> mount option, but with this patch we have absolutely unpredictable
> behaviour on errors, which is bad! Also we can end up with read-only
> file system even when errors=continue has been set.
>
In ext4 without a journal, when the disk drops out, the fs never becomes
read-only. But in ext3/4 with a journal, jbd2 will abort the filesystem
and change it to read-only. So we don't care what kind of error happens;
we just want to change the fs to read-only when there are too many
errors.

> You can use atomic_t and get rid of the spinlock maybe ?
>
Yes, thanks

> The name for this function should rather be inc_sb_error_count().
Thanks

> I am not sure, but given that it it a "threshold" should not we trigger
> it when we hit the threshold and not threshold+1 ?
Thanks, I should use ">="

> Could you use better error message ? This does not say nothing about why
> it happened. Something about IO errors count reached the threshold ?
Yes, when the I/O error count reaches the threshold, we need to change the fs to read-only.

> Maybe you can use atomic operations and get rid of the spin_lock.
spin_lock is just a "lazy approach"
Jan Kara June 20, 2011, 1:36 p.m. UTC | #3
On Fri 17-06-11 20:08:36, stufever@gmail.com wrote:
> From: Wang Shaoyan <wangshaoyan.pt@taobao.com>
> 
> Some version of Hadoop uses access(2) to check whether the data chunk
> harddisk is online, if access(2) returns error, hadoop marks the disk
> which it called access(2) as offline. This method works for Ext3/4 with
> journal, because when jbd/jbd2 encounters I/O error, the file system will
> be set as read-only. For Ext4 no-journal mode, there is no jdb2 to set
> the file system as read-only when I/O error happens, the access(2) from
> Hadoop is not able to reliably detect hard disk offline condition.
> 
> This patch tries to fix the above problem from kernel side. People can
> set I/O error threshold, in 2 conditions Ext4 file system without journal
> will be set as read-only:
> 1) inside the sampling interval, I/O errors come more then pre-set
> threshold happens
> 2) I/O errors always happen in continous sampling intervals, the sum of
> errors exceeds pre-set threshold
> 
> Then the application can find the file system is set as read-only, and
> call its own failure tolerance procedures.
> 
> There are 2 interface exported to user space via sysfs:
> /sys/fs/ext4/sd[?]/eio_threshold --- I/O error threshold to set file system as read-only
> /sys/fs/ext4/sd[?]/eio_interval  --- sampling interval in second
> 
> If echo 0 into file eio_threshold, I/O error threshold will be infinite, no file system read-only will be triggered.
  Hum, if I understand your problem right, you should just mount the
filesystem with errors=remount-ro and you will get the behavior you need.
Or what is insufficient about that solution? Your patch surely provides more
flexibility, but is that really needed?

BTW, in a cluster environment (which Hadoop seems to be, AFAIU) it is
standard to mount the filesystem with the even stricter errors=panic so
that the node is taken off the grid as soon as some problem happens.
Usually handling service failover is simpler than handling an uncertain
state after a filesystem error.

								Honza
> Cc: Ted Tso <tytso@mit.edu>
> Cc: Jan Kara <jack@suse.cz>
> Reviewed-by: Coly Li <bosong.ly@taobao.com>
> Reviewed-by: Liu Yuan <tailai.ly@taobao.com>
> Signed-off-by: Wang Shaoyan <wangshaoyan.pt@taobao.com>
> 
> ---
>  fs/ext4/ext4.h  |    5 +++++
>  fs/ext4/super.c |   51 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 56 insertions(+), 0 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 1921392..8f445a8 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1108,6 +1108,11 @@ struct ext4_sb_info {
>  	int s_first_ino;
>  	unsigned int s_inode_readahead_blks;
>  	unsigned int s_inode_goal;
> +	spinlock_t s_eio_lock;
> +	unsigned int s_eio_threshold;
> +	unsigned int s_eio_interval;
> +	unsigned int s_eio_counter;
> +	unsigned long s_eio_last_jiffies;
>  	spinlock_t s_next_gen_lock;
>  	u32 s_next_generation;
>  	u32 s_hash_seed[4];
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index cc5c157..f85ddcd 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -384,6 +384,23 @@ static void save_error_info(struct super_block *sb, const char *func,
>  	ext4_commit_super(sb, 1);
>  }
>  
> +static inline void check_error_number(struct super_block *sb)
> +{
> +	struct ext4_sb_info *sbi = EXT4_SB(sb);
> +
> +	if (time_after(sbi->s_eio_last_jiffies + sbi->s_eio_interval * HZ, jiffies)) {
> +		sbi->s_eio_counter++;
> +	} else {
> +		sbi->s_eio_counter = 1;
> +	}
> +
> +	sbi->s_eio_last_jiffies = jiffies;
> +	ext4_msg(sb, KERN_CRIT, "count total: %d", sbi->s_eio_counter);
> +	
> +	if (sbi->s_eio_counter > sbi->s_eio_threshold) { 
> +		ext4_abort(sb, "Two many io error, abort it");
> +	}
> +}
>  
>  /* Deal with the reporting of failure conditions on a filesystem such as
>   * inconsistencies detected or read IO failures.
> @@ -402,9 +419,17 @@ static void save_error_info(struct super_block *sb, const char *func,
>  
>  static void ext4_handle_error(struct super_block *sb)
>  {
> +	struct ext4_sb_info *sbi = EXT4_SB(sb);
> +
>  	if (sb->s_flags & MS_RDONLY)
>  		return;
>  
> +	if (sbi->s_eio_threshold && !sbi->s_journal) {
> +		spin_lock(&sbi->s_eio_lock);
> +		check_error_number(sb);
> +		spin_unlock(&sbi->s_eio_lock);
> +	}
> +
>  	if (!test_opt(sb, ERRORS_CONT)) {
>  		journal_t *journal = EXT4_SB(sb)->s_journal;
>  
> @@ -2471,6 +2496,22 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
>  	return count;
>  }
>  
> +static ssize_t eio_interval_store(struct ext4_attr *a,
> +					  struct ext4_sb_info *sbi,
> +					  const char *buf, size_t count)
> +{
> +	unsigned long t;
> +
> +	if (parse_strtoul(buf, 0xffffffff, &t))
> +		return -EINVAL;
> +
> +	if (t <= 0)
> +		return -EINVAL;
> +
> +	sbi->s_eio_interval = t;
> +	return count;
> +}
> +
>  static ssize_t sbi_ui_show(struct ext4_attr *a,
>  			   struct ext4_sb_info *sbi, char *buf)
>  {
> @@ -2524,6 +2565,9 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
>  EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
>  EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
>  EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
> +EXT4_RW_ATTR_SBI_UI(eio_threshold, s_eio_threshold);
> +EXT4_ATTR_OFFSET(eio_interval, 0644, sbi_ui_show,
> +		 eio_interval_store, s_eio_interval);
>  
>  static struct attribute *ext4_attrs[] = {
>  	ATTR_LIST(delayed_allocation_blocks),
> @@ -2540,6 +2584,8 @@ static struct attribute *ext4_attrs[] = {
>  	ATTR_LIST(mb_stream_req),
>  	ATTR_LIST(mb_group_prealloc),
>  	ATTR_LIST(max_writeback_mb_bump),
> +	ATTR_LIST(eio_threshold),
> +	ATTR_LIST(eio_interval),
>  	NULL,
>  };
>  
> @@ -3464,6 +3510,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>  	sbi->s_stripe = ext4_get_stripe_size(sbi);
>  	sbi->s_max_writeback_mb_bump = 128;
>  
> +	spin_lock_init(&sbi->s_eio_lock);
> +	sbi->s_eio_threshold = 10;
> +	sbi->s_eio_interval = 5;
> +	sbi->s_eio_counter = 0;
> +
>  	/*
>  	 * set up enough so that it can read an inode
>  	 */
> -- 
> 1.7.4.1
>
stufever@gmail.com June 20, 2011, 2:12 p.m. UTC | #4
Thanks for your reply!
2011/6/20 Jan Kara <jack@suse.cz>:

>  Hum, if I understand your problem right, you should just mount the
> filesystem with errors=remount-ro and you will get the behavior you need.
> Or what is insufficient on that solution? Your patch surely provides more
> flexibility but is that really needed?
>

1. There are more than ten hard disks in each of our production
machines, so it is not right to panic the whole system based on a
single error on one hard disk.
2. Multiple tasks may access the same hard drive at the same time, so
it is not ideal to change the file system to read-only based on one
error in one task, while other tasks may be killed.

That's why we have a relaxed restriction: only when the error counter
is greater than our threshold do we change the fs to read-only or
panic. When a system has a dozen hard drives and each hard drive is
running several tasks at a time, this feature is a real demand.

> BTW, in cluster environment (which Hadoop seems to be AFAIU) it is standard
> to mount filesystem even with stricter errors=panic so that node is taken
> off the grid as soon as some problem happens. Usually handling service
> failover is simpler than handling uncertain state after a filesystem error.
>
Jan Kara June 20, 2011, 2:41 p.m. UTC | #5
On Mon 20-06-11 22:12:48, Wang Shaoyan wrote:
> Thanks for your reply!
> 2011/6/20 Jan Kara <jack@suse.cz>:
> 
> >  Hum, if I understand your problem right, you should just mount the
> > filesystem with errors=remount-ro and you will get the behavior you need.
> > Or what is insufficient on that solution? Your patch surely provides more
> > flexibility but is that really needed?
> >
> 
> 1.There are more than ten hard disks in each of our production
> machine, so it is not right for
> making the whole system panic, only based on one error in one harddisk.
> 2.There may be multiple tasks which access the same hard drive at the
> same time, so it is
> not ideal for changing the system to readonly, only based on one error
> in one task,
> while other task may be killed.
> 
> That's why we have a relaxed restrictions, only when the error counter
> is grower than our
> threshold, we change fs to readonly or panic.
> When a system has a dozen hard drives, each hard drive is running
> several tasks on time,
> this feature is a real demand.
  OK, but then your changelog is just misleading because your need for the
feature has nothing to do with different error behavior in ext4 nojournal
mode as far as I understand. So please describe your real needs in the
changelog as you described them above...

That being said, I'm not sure policies like when to remount a fs
read-only belong in the kernel. It would seem more appropriate to me to
somehow propagate information about all IO errors to user space and have
some monitoring daemon (be it Hadoop or Nagios or whatever people use)
handle remounting the filesystem and marking the disk as offline. But
this is just my opinion and I don't want to block this feature. It's more
a suggestion for other people...

								Honza
Theodore Ts'o June 21, 2011, 2:48 p.m. UTC | #6
Ugh.  This is really, really, *really* ugly.  If you really want to
have hadoop shut down when there are too many errors, it's much better
to expose the number of EIO errors via sysfs, and then have some kind
of system daemon do the right thing.

But actually converting EIO's to file system errors isn't really a
good idea.  Consider that most of the time, when you get a read error
from the disk, if you rewrite that block, all will be well.  So taking
the entire disk off-line, and setting the errors fs bit won't really
help.  (a) Until the block is rewritten, the next time you try to read
it, you'll get an error, and (b) running fsck will be a waste of time,
since it will only scan the metadata blocks, and so the data block
will still have an error.

I assume you're using hadoopfs as your cluster file system, which has
redundancy at the file system level, right?  So getting an EIO won't
be the end of the world, since you can always read the data chunk from a
redundant copy, or perform a Reed-Solomon reconstruction.  In fact,
disabling the entire file system is the worst thing you can do, since
you lose access to the rest of the files, which increases network
traffic to your cluster interconnect, especially if you have to do an R-S
reconstruction.

(In fact I've recently written up a plan to turn metadata errors into
EIO's, without bringing down the entire file system as containing
errors, to make the file system more resilient to I/O errors --- the
exact reverse of what you're trying to do.)

For data I/O errors, what you in fact want to do is to handle them in
userspace, and just have HDFS delete the local copy of the file.  The
next time you allocate the space and rewrite the block, the disk will
do a bad block remap, and you'll be OK.

Now, you may want to do different things if the disk has completely
disappeared, or has completely died, so this is a case where it would
be desirable to get finer grained error reporting from the block I/O
layer --- there's a big difference between what you do for an error
caused by an unreadable block, and one caused by disk controller
bursting into flame.  But in general, remounting the file system
read-only should be a last-resort thing, and not the first thing you
should try doing.

Regards,

						- Ted
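
A rough userspace sketch of the sysfs-polling daemon approach described
above; the attribute path below is hypothetical, since it is not something
the posted patch exports, and the policy reaction is only indicated by a
comment:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Hypothetical counter attribute; adjust to whatever gets exported. */
	const char *path = "/sys/fs/ext4/sdb/eio_count";
	long last = 0;

	for (;;) {
		FILE *f = fopen(path, "r");
		long cur = last;

		if (f) {
			if (fscanf(f, "%ld", &cur) != 1)
				cur = last;
			fclose(f);
		}
		if (cur > last) {
			/* Policy belongs here: tell HDFS to mark the disk
			 * offline, remount read-only, or just log it. */
			fprintf(stderr, "EIO count is now %ld\n", cur);
			last = cur;
		}
		sleep(5);
	}
	return 0;
}
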
Andreas Dilger June 21, 2011, 3:58 p.m. UTC | #7
On 2011-06-21, at 8:48 AM, Ted Ts'o <tytso@mit.edu> wrote:
> In fact I've recently written up a plan to turn metadata errors into
> EIO's, without bringing down the entire file system as containing
> errors, to make the file system more resiliant to I/O errors --- the
> exact reverse of what you're trying to do.

Could you share that plan on the list?  

Cheers, Andreas

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1921392..8f445a8 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1108,6 +1108,11 @@  struct ext4_sb_info {
 	int s_first_ino;
 	unsigned int s_inode_readahead_blks;
 	unsigned int s_inode_goal;
+	spinlock_t s_eio_lock;
+	unsigned int s_eio_threshold;
+	unsigned int s_eio_interval;
+	unsigned int s_eio_counter;
+	unsigned long s_eio_last_jiffies;
 	spinlock_t s_next_gen_lock;
 	u32 s_next_generation;
 	u32 s_hash_seed[4];
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index cc5c157..f85ddcd 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -384,6 +384,23 @@  static void save_error_info(struct super_block *sb, const char *func,
 	ext4_commit_super(sb, 1);
 }
 
+static inline void check_error_number(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	if (time_after(sbi->s_eio_last_jiffies + sbi->s_eio_interval * HZ, jiffies)) {
+		sbi->s_eio_counter++;
+	} else {
+		sbi->s_eio_counter = 1;
+	}
+
+	sbi->s_eio_last_jiffies = jiffies;
+	ext4_msg(sb, KERN_CRIT, "count total: %d", sbi->s_eio_counter);
+	
+	if (sbi->s_eio_counter > sbi->s_eio_threshold) { 
+		ext4_abort(sb, "Two many io error, abort it");
+	}
+}
 
 /* Deal with the reporting of failure conditions on a filesystem such as
  * inconsistencies detected or read IO failures.
@@ -402,9 +419,17 @@  static void save_error_info(struct super_block *sb, const char *func,
 
 static void ext4_handle_error(struct super_block *sb)
 {
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
 	if (sb->s_flags & MS_RDONLY)
 		return;
 
+	if (sbi->s_eio_threshold && !sbi->s_journal) {
+		spin_lock(&sbi->s_eio_lock);
+		check_error_number(sb);
+		spin_unlock(&sbi->s_eio_lock);
+	}
+
 	if (!test_opt(sb, ERRORS_CONT)) {
 		journal_t *journal = EXT4_SB(sb)->s_journal;
 
@@ -2471,6 +2496,22 @@  static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
 	return count;
 }
 
+static ssize_t eio_interval_store(struct ext4_attr *a,
+					  struct ext4_sb_info *sbi,
+					  const char *buf, size_t count)
+{
+	unsigned long t;
+
+	if (parse_strtoul(buf, 0xffffffff, &t))
+		return -EINVAL;
+
+	if (t <= 0)
+		return -EINVAL;
+
+	sbi->s_eio_interval = t;
+	return count;
+}
+
 static ssize_t sbi_ui_show(struct ext4_attr *a,
 			   struct ext4_sb_info *sbi, char *buf)
 {
@@ -2524,6 +2565,9 @@  EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
 EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
 EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
+EXT4_RW_ATTR_SBI_UI(eio_threshold, s_eio_threshold);
+EXT4_ATTR_OFFSET(eio_interval, 0644, sbi_ui_show,
+		 eio_interval_store, s_eio_interval);
 
 static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(delayed_allocation_blocks),
@@ -2540,6 +2584,8 @@  static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(mb_stream_req),
 	ATTR_LIST(mb_group_prealloc),
 	ATTR_LIST(max_writeback_mb_bump),
+	ATTR_LIST(eio_threshold),
+	ATTR_LIST(eio_interval),
 	NULL,
 };
 
@@ -3464,6 +3510,11 @@  static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_stripe = ext4_get_stripe_size(sbi);
 	sbi->s_max_writeback_mb_bump = 128;
 
+	spin_lock_init(&sbi->s_eio_lock);
+	sbi->s_eio_threshold = 10;
+	sbi->s_eio_interval = 5;
+	sbi->s_eio_counter = 0;
+
 	/*
 	 * set up enough so that it can read an inode
 	 */