Patchwork ext4: record error messages in message buffer

login
register
mail settings
Submitter Amir Goldstein
Date March 1, 2011, 5:27 a.m.
Message ID <AANLkTimksvJ1hTwE-pqidDzVh95GGtOyr0m0At+ovJ_d@mail.gmail.com>
Download mbox | patch
Permalink /patch/84894/
State New
Headers show

Comments

Amir Goldstein - March 1, 2011, 5:27 a.m.
Hi Ted,

I found this patch in my haystack.
Are you planning to merge it in upcoming merge window?

I will follow with latest version of fsck patch.

Thanks,
Amir.

---------- Forwarded message ----------
From: Amir Goldstein <amir73il@gmail.com>
Date: Sun, Jan 2, 2011 at 9:23 PM
Subject: Re: Regular ext4 error warning with HD in USB dock
To: Ric Wheeler <rwheeler@redhat.com>
Cc: Ted Ts'o <tytso@mit.edu>, Rogier Wolff <R.E.Wolff@bitwizard.nl>,
Con Kolivas <kernel@kolivas.org>, adilger.kernel@dilger.ca,
linux-ext4@vger.kernel.org


On Sat, Jan 1, 2011 at 7:20 PM, Ric Wheeler <rwheeler@redhat.com> wrote:
> On 12/28/2010 05:30 AM, Amir Goldstein wrote:
>>
>> On Tue, Dec 28, 2010 at 10:19 AM, Rogier Wolff<R.E.Wolff@bitwizard.nl>
>>  wrote:
>>>
>>> On Mon, Dec 27, 2010 at 09:53:43PM -0500, Ted Ts'o wrote:
>>>>
>>>> On Tue, Dec 28, 2010 at 09:53:45AM +1100, Con Kolivas wrote:
>>>>>
>>>>> [1048401.773270] EXT4-fs (sde8): mounted filesystem with writeback data
>>>>> mode.
>>>>> Opts: (null)
>>>>> [1048702.736011] EXT4-fs (sde8): error count: 3
>>>>> [1048702.736016] EXT4-fs (sde8): initial error at 1289053677:
>>>>> ext4_journal_start_sb:251
>>>>> [1048702.736018] EXT4-fs (sde8): last error at 1289080948:
>>>>> ext4_put_super:719
>>>>
>>>> That's actually not an error.  It's a report which is generated every
>>>> 24 hours, indicating that there has been 3 errors since the last time
>>>> the error count has been cleared, with the first error taking place at
>>>> Sat Nov 6 10:27:57 2010 (US/Eastern) in the function
>>>> ext4_journal_start_sb(), at line 251, and the most recent error taking
>>>> place at Sat Nov 6 18:02:28 2010 (US/Eastern), in the function
>>>> ext4_put_super() at line 719.  This is a new feature which was added
>>>> in 2.6.36.
>>>
>>> Nice. But the issue you're not mentioning is: What errors could have
>>> happened on the 6th of november? Should Con worry about those errors?
>>>
>> Ted,
>>
>> I would like to use this opportunity to remind you about my
>> record_journal_errstr()
>> implementation, see:
>>
>> https://github.com/amir73il/ext4-snapshots/blob/next3-stable/fs/next3/super.c#L157
>>
>> It records the initial errors messages (which I found to be the most
>> interesting),
>> in a message buffer on the unused space after the journal super block
>> (3K on a 4K block fs).
>>
>> fsck prints out those messages and clears the buffer.
>> In under a year of Next3 fs in beta and production, it has helped me many
>> times
>> to analyse bugs post-mortem and find the problem.
>>
>> If there is demand, I can post the patch for Ext4.
>>
>> Amir.
>
> I do think that this sounds like a useful addition - should be very useful
> in doing post mortem analysis...
>
> Thanks!
>
> Ric
>
>

Hi Ric,

Attached are the patches for ext4 (against Ted's next branch) and
e2fsck (against 1.41.14).

e2fsck patch is running in production with e2fsprogs-1.41.9 and the
porting to 1.41.14 was trivial.
ext4 patch is a 'trivial+' port of the Next3 patch running in production.

Do you think you guys can take the patches for a test drive?

Thanks,
Amir.


ext4: record error messages in message buffer

Ext4 error messages are stored in the free space after the journal super block.
After journal recovery, the message buffer is copied from the journal super
block to the file system super block.

When error behavior is configured to remount-ro, the message buffer can be read
and displayed at a later time by e2fsck. The messages from the time of the
error are very helpful in post-mortem bug analysis.

Signed-off-by: Amir Goldstein <amir73il@users.sf.net>


@@ -311,6 +354,9 @@ void ext4_journal_abort_handle(const char *caller,
unsigned int line,
       printk(KERN_ERR "%s:%d: aborting transaction: %s in %s\n",
              caller, line, errstr, err_fn);

+       /* record error message in journal super block */
+       ext4_record_journal_errstr(sb, caller, err_fn, errstr);
+
       jbd2_journal_abort_handle(handle);
 }

@@ -398,6 +444,11 @@ void __ext4_error(struct super_block *sb, const
char *function,
              sb->s_id, function, line, current->comm, &vaf);
       va_end(args);

+       /* record error message in journal super block */
+       va_start(args, fmt);
+       ext4_record_journal_err(sb, __func__, function, fmt, args);
+       va_end(args);
+
       ext4_handle_error(sb);
 }

@@ -422,6 +473,11 @@ void ext4_error_inode(struct inode *inode, const
char *function,
       printk(KERN_CONT "comm %s: %pV\n", current->comm, &vaf);
       va_end(args);

+       /* record error message in journal super block */
+       va_start(args, fmt);
+       ext4_record_journal_err(inode->i_sb, __func__, function, fmt, args);
+       va_end(args);
+
       ext4_handle_error(inode->i_sb);
 }

@@ -452,6 +508,11 @@ void ext4_error_file(struct file *file, const
char *function,
       printk(KERN_CONT "comm %s: path %s: %pV\n", current->comm, path, &vaf);
       va_end(args);

+       /* record error message in journal super block */
+       va_start(args, fmt);
+       ext4_record_journal_err(inode->i_sb, __func__, function, fmt, args);
+       va_end(args);
+
       ext4_handle_error(inode->i_sb);
 }

@@ -510,6 +571,9 @@ void __ext4_std_error(struct super_block *sb,
const char *function,
              sb->s_id, function, line, errstr);
       save_error_info(sb, function, line);

+       /* record error message in journal super block */
+       ext4_record_journal_errstr(sb, __func__, function, errstr);
+
       ext4_handle_error(sb);
 }

@@ -536,6 +600,11 @@ void __ext4_abort(struct super_block *sb, const
char *function,
       printk("\n");
       va_end(args);

+       /* record error message in journal super block */
+       va_start(args, fmt);
+       ext4_record_journal_err(sb, __func__, function, fmt, args);
+       va_end(args);
+
       if ((sb->s_flags & MS_RDONLY) == 0) {
               ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
               sb->s_flags |= MS_RDONLY;
@@ -602,6 +671,11 @@ __acquires(bitlock)
       printk(KERN_CONT "%pV\n", &vaf);
       va_end(args);

+       /* record error message in journal super block */
+       va_start(args, fmt);
+       ext4_record_journal_err(sb, __func__, function, fmt, args);
+       va_end(args);
+
       if (test_opt(sb, ERRORS_CONT)) {
               ext4_commit_super(sb, 0);
               return;
@@ -4067,12 +4141,35 @@ static void ext4_clear_journal_err(struct
super_block *sb,
       j_errno = jbd2_journal_errno(journal);
       if (j_errno) {
               char nbuf[16];
+               char *buf1, *buf2;
+               unsigned long offset1, offset2;
+               int len1, len2;
+
+               /* copy message buffer from journal to super block */
+               buf1 = (char *)journal->j_superblock;
+               offset1 = (unsigned long)buf1 % sb->s_blocksize;
+               buf1 += sizeof(journal_superblock_t);
+               offset1 += sizeof(journal_superblock_t);
+               len1 = sb->s_blocksize - offset1;
+               buf2 = (char *)EXT4_SB(sb)->s_es;
+               offset2 = (unsigned long)buf2 % sb->s_blocksize;
+               buf2 += sizeof(struct ext4_super_block);
+               offset2 += sizeof(struct ext4_super_block);
+               len2 = sb->s_blocksize - offset2;
+               if (len2 > len1)
+                       len2 = len1;
+               if (len2 > 0 && *buf1)
+                       memcpy(buf2, buf1, len2);

               errstr = ext4_decode_error(sb, j_errno, nbuf);
               ext4_warning(sb, "Filesystem error recorded "
                            "from previous mount: %s", errstr);
               ext4_warning(sb, "Marking fs in need of filesystem check.");

+               /* clear journal message buffer */
+               if (len1 > 0)
+                       memset(buf1, 0, len1);
+
               EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
               es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
               ext4_commit_super(sb, 1);

Patch

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7728a4c..8888815 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -290,10 +290,53 @@  int __ext4_journal_stop(const char *where,
unsigned int line, handle_t *handle)
       return err;
 }

+/* record error messages after journal super block */
+static void ext4_record_journal_err(struct super_block *sb, const char *where,
+               const char *function, const char *fmt, va_list args)
+{
+#define MSGLEN 256
+       journal_t *journal = EXT4_SB(sb)->s_journal;
+       char *buf;
+       unsigned long offset;
+       int len;
+
+       if (!journal)
+               return;
+
+       buf = (char *)journal->j_superblock;
+       offset = (unsigned long)buf % sb->s_blocksize;
+       buf += sizeof(journal_superblock_t);
+       offset += sizeof(journal_superblock_t);
+
+       /* seek to end of message buffer */
+       while (offset < sb->s_blocksize && *buf) {
+               buf += MSGLEN;
+               offset += MSGLEN;
+       }
+
+       if (offset+MSGLEN > sb->s_blocksize)
+               /* no space left in message buffer */
+               return;
+
+       len = snprintf(buf, MSGLEN, "%s: %s: ", where, function);
+       len += vsnprintf(buf+len, MSGLEN-len, fmt, args);
+}
+
+static void ext4_record_journal_errstr(struct super_block *sb,
+               const char *where, const char *function, ...)
+{
+       va_list args;
+
+       va_start(args, function);
+       ext4_record_journal_err(sb, where, function, "%s\n", args);
+       va_end(args);
+}
+
 void ext4_journal_abort_handle(const char *caller, unsigned int line,
                              const char *err_fn, struct buffer_head *bh,
                              handle_t *handle, int err)
 {
+       struct super_block *sb = handle->h_transaction->t_journal->j_private;
       char nbuf[16];
       const char *errstr = ext4_decode_error(NULL, err, nbuf);