diff mbox series

jbd2: avoid mount failure when commit block is partially submitted

Message ID 20240402090951.527619-1-yebin10@huawei.com
State New
Headers show
Series jbd2: avoid mount failed when commit block is partial submitted | expand

Commit Message

yebin (H) April 2, 2024, 9:09 a.m. UTC
We encountered a problem that the file system could not be mounted in
the power-off scenario. The analysis of the file system mirror shows that
only part of the data is written to the last commit block.
To solve the above issue, if the commit block checksum is incorrect, check
whether the next block has a valid magic and transaction ID. If the next block
has no valid magic or transaction ID, just drop the last transaction and
ignore the checksum error. Theoretically, the transaction ID may wrap around,
which could still cause the mount to fail.

Signed-off-by: Ye Bin <yebin10@huawei.com>
---
 fs/jbd2/recovery.c | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

Comments

Jan Kara April 2, 2024, 1:42 p.m. UTC | #1
On Tue 02-04-24 17:09:51, Ye Bin wrote:
> We encountered a problem that the file system could not be mounted in
> the power-off scenario. The analysis of the file system mirror shows that
> only part of the data is written to the last commit block.
> To solve above issue, if commit block checksum is incorrect, check the next
> block if has valid magic and transaction ID. If next block hasn't valid
> magic or transaction ID then just drop the last transaction ignore checksum
> error. Theoretically, the transaction ID maybe occur loopback, which may cause
> the mounting failure.
> 
> Signed-off-by: Ye Bin <yebin10@huawei.com>

So this is curious. The commit block data is fully within one sector and
the expectation of the journaling is that either full sector or nothing is
written. So what kind of storage were you using that it breaks these
expectations?

								Honza

> ---
>  fs/jbd2/recovery.c | 39 +++++++++++++++++++++++++++++++++++++++
>  1 file changed, 39 insertions(+)
> 
> diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
> index 1f7664984d6e..0a09f1a5fd1e 100644
> --- a/fs/jbd2/recovery.c
> +++ b/fs/jbd2/recovery.c
> @@ -463,6 +463,41 @@ static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
>  		return tag->t_checksum == cpu_to_be16(csum32);
>  }
>  
> +static int check_incomplete_commit(journal_t *journal, unsigned long next_block,
> +				   unsigned int next_commit_ID)
> +{
> +	journal_header_t *tmp;
> +	struct buffer_head *bh;
> +	int err = 0;
> +
> +	err = jread(&bh, journal, next_block);
> +	if (err)
> +		return err;
> +
> +	tmp = (journal_header_t *)bh->b_data;
> +	/*
> +	 * If the next block does not contain consecutive transactions, it can
> +	 * be considered that the checksum error of the current commit block
> +	 * is caused by incomplete commit. Ignore the checksum error and drop
> +	 * the last transaction.
> +	 */
> +	if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) ||
> +	    be32_to_cpu(tmp->h_sequence) != next_commit_ID) {
> +		jbd2_debug("JBD2: will drop incomplete transaction %u commit block %lu\n",
> +			   next_commit_ID - 1, next_block - 1);
> +		goto out;
> +	}
> +
> +	pr_err("JBD2: potential continuous transaction detected %u at %lu, "
> +	       "likely invalid checksum in transaction %u\n",
> +	       next_commit_ID,  next_block, next_commit_ID - 1);
> +
> +	err = -EFSBADCRC;
> +out:
> +	brelse(bh);
> +	return err;
> +}
> +
>  static int do_one_pass(journal_t *journal,
>  			struct recovery_info *info, enum passtype pass)
>  {
> @@ -810,6 +845,10 @@ static int do_one_pass(journal_t *journal,
>  			if (pass == PASS_SCAN &&
>  			    !jbd2_commit_block_csum_verify(journal,
>  							   bh->b_data)) {
> +				if (!check_incomplete_commit(journal,
> +							     next_log_block,
> +							     next_commit_ID + 1))
> +					goto ignore_crc_mismatch;
>  			chksum_error:
>  				if (commit_time < last_trans_commit_time)
>  					goto ignore_crc_mismatch;
> -- 
> 2.31.1
>
Theodore Ts'o April 3, 2024, 3:37 a.m. UTC | #2
On Tue, Apr 02, 2024 at 03:42:40PM +0200, Jan Kara wrote:
> On Tue 02-04-24 17:09:51, Ye Bin wrote:
> > We encountered a problem that the file system could not be mounted in
> > the power-off scenario. The analysis of the file system mirror shows that
> > only part of the data is written to the last commit block.
> > To solve above issue, if commit block checksum is incorrect, check the next
> > block if has valid magic and transaction ID. If next block hasn't valid
> > magic or transaction ID then just drop the last transaction ignore checksum
> > error. Theoretically, the transaction ID maybe occur loopback, which may cause
> > the mounting failure.
> > 
> > Signed-off-by: Ye Bin <yebin10@huawei.com>
> 
> So this is curious. The commit block data is fully within one sector and
> the expectation of the journaling is that either full sector or nothing is
> written. So what kind of storage were you using that it breaks these
> expectations?

I suppose if the physical sector size is 512 bytes, and the file
system block is 4k, I suppose it's possible that on a crash, that part
of the 4k commit block could be written.  In *practice* though, this
is super rare.  That's because on many modern HDD's, the physical
sector size is 4k (because the ECC overhead is much lower), even if
the logical sector size is 512 byte (for Windows 98 compatibility).
And even on HDD's where the physical sector size is really 512 bytes,
the way the sectors are laid out in a serpentine fashion, it is
*highly* likely that 4k write won't get torn.

And while this is *possible*, it's also possible that some kind of I/O
transfer error --- such as some bit flips which breaks the checksum on
the commit block, but also trashes the tid of the subsequent block,
such that your patch gets tricked into thinking that this is the
partial last commit, when in fact it's not the last commit, thus
causing the journal replay to abort early.  If that's the case, it's much
safer to force fsck to be run to detect any inconsistency that might
result.

In general, I strongly recommend that fsck be run on the file system
before you try to mount it.  Yeah, historically the root file system
gets mounted read-only, and then fsck gets run on it, and if
necessary, fsck will fix it up and then force a reboot.  Ye, I'm
assuming that this is what you're doing, and so that's why you really
don't want the mount to fail?

If so, the better way to address this is to use an initramfs which can
run fsck on the real root file system, and then mount it, and then use
pivot_root and then exec'ing the real init program.  That way, even if
the journal is corrupted in that way, fsck will attempt to replay the
journal, fail, and you can have fsck do a forced fsck to fix up the
file system.

						- Ted
Jan Kara April 3, 2024, 10:11 a.m. UTC | #3
On Tue 02-04-24 23:37:42, Theodore Ts'o wrote:
> On Tue, Apr 02, 2024 at 03:42:40PM +0200, Jan Kara wrote:
> > On Tue 02-04-24 17:09:51, Ye Bin wrote:
> > > We encountered a problem that the file system could not be mounted in
> > > the power-off scenario. The analysis of the file system mirror shows that
> > > only part of the data is written to the last commit block.
> > > To solve above issue, if commit block checksum is incorrect, check the next
> > > block if has valid magic and transaction ID. If next block hasn't valid
> > > magic or transaction ID then just drop the last transaction ignore checksum
> > > error. Theoretically, the transaction ID maybe occur loopback, which may cause
> > > the mounting failure.
> > > 
> > > Signed-off-by: Ye Bin <yebin10@huawei.com>
> > 
> > So this is curious. The commit block data is fully within one sector and
> > the expectation of the journaling is that either full sector or nothing is
> > written. So what kind of storage were you using that it breaks these
> > expectations?
> 
> I suppose if the physical sector size is 512 bytes, and the file
> system block is 4k, I suppose it's possible that on a crash, that part
> of the 4k commit block could be written.

I was thinking about that as well but the commit block looks like:

struct commit_header {
        __be32          h_magic;
        __be32          h_blocktype;
        __be32          h_sequence;
        unsigned char   h_chksum_type;
        unsigned char   h_chksum_size;
        unsigned char   h_padding[2];
        __be32          h_chksum[JBD2_CHECKSUM_BYTES];
        __be64          h_commit_sec;
        __be32          h_commit_nsec;
};

where JBD2_CHECKSUM_BYTES is 8. So all the data in the commit block
including the checksum is in the first 60 bytes. Hence I would be really
surprised if some storage can tear that...

Hence either Ye Bin is running on some really exotic storage or the storage
/ CPU in fact flipped bits somewhere so that the checksum didn't match or
the commit block was in fact not written now but it was a leftover from
previous journal use and h_sequence happened to match. Very unlikely but
depending on how exactly they do their powerfail testing I can imagine it
would be possible with enough tries...

> In *practice* though, this
> is super rare.  That's because on many modern HDD's, the physical
> sector size is 4k (because the ECC overhead is much lower), even if
> the logical sector size is 512 byte (for Windows 98 compatibility).
> And even on HDD's where the physical sector size is really 512 bytes,
> the way the sectors are laid out in a serpentine fashion, it is
> *highly* likely that 4k write won't get torn.
> 
> And while this is *possible*, it's also possible that some kind of I/O
> transfer error --- such as some bit flips which breaks the checksum on
> the commit block, but also trashes the tid of the subsequent block,
> such that your patch gets tricked into thinking that this is the
> partial last commit, when in fact it's not the last commit, thus
> causing the journal replay abort early.  If that's case, it's much
> safer to force fsck to be run to detect any inconsistency that might
> result.

Yeah, I agree in these cases of a corrupted journal it seems dangerous to
just try to continue without fsck based on some heuristics.

								Honza
yebin (H) April 7, 2024, 1:37 a.m. UTC | #4
On 2024/4/3 18:11, Jan Kara wrote:
> On Tue 02-04-24 23:37:42, Theodore Ts'o wrote:
>> On Tue, Apr 02, 2024 at 03:42:40PM +0200, Jan Kara wrote:
>>> On Tue 02-04-24 17:09:51, Ye Bin wrote:
>>>> We encountered a problem that the file system could not be mounted in
>>>> the power-off scenario. The analysis of the file system mirror shows that
>>>> only part of the data is written to the last commit block.
>>>> To solve above issue, if commit block checksum is incorrect, check the next
>>>> block if has valid magic and transaction ID. If next block hasn't valid
>>>> magic or transaction ID then just drop the last transaction ignore checksum
>>>> error. Theoretically, the transaction ID maybe occur loopback, which may cause
>>>> the mounting failure.
>>>>
>>>> Signed-off-by: Ye Bin <yebin10@huawei.com>
>>> So this is curious. The commit block data is fully within one sector and
>>> the expectation of the journaling is that either full sector or nothing is
>>> written. So what kind of storage were you using that it breaks these
>>> expectations?
>> I suppose if the physical sector size is 512 bytes, and the file
>> system block is 4k, I suppose it's possible that on a crash, that part
>> of the 4k commit block could be written.
> I was thinking about that as well but the commit block looks like:
>
> truct commit_header {
>          __be32          h_magic;
>          __be32          h_blocktype;
>          __be32          h_sequence;
>          unsigned char   h_chksum_type;
>          unsigned char   h_chksum_size;
>          unsigned char   h_padding[2];
>          __be32          h_chksum[JBD2_CHECKSUM_BYTES];
>          __be64          h_commit_sec;
>          __be32          h_commit_nsec;
> };
>
> where JBD2_CHECKSUM_BYTES is 8. So all the data in the commit block
> including the checksum is in the first 60 bytes. Hence I would be really
> surprised if some storage can tear that...
This issue has been encountered a few times in the context of eMMC devices.
The vendor has confirmed that only 512-byte atomicity can be ensured in the
firmware. Although the valid data is only 60 bytes, the entire commit block
is used for calculating the checksum.
jbd2_commit_block_csum_verify:
...
calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
...
>
> Hence either Ye Bin is running on some really exotic storage or the storage
> / CPU in fact flipped bits somewhere so that the checksum didn't match or
> the commit block was in fact not written now but it was a leftover from
> previous journal use and h_sequence happened to match. Very unlikely but
> depending on how exactly they do their powerfail testing I can imagine it
> would be possible with enough tries...
>
>> In *practice* though, this
>> is super rare.  That's because on many modern HDD's, the physical
>> sector size is 4k (because the ECC overhead is much lower), even if
>> the logical sector size is 512 byte (for Windows 98 compatibility).
>> And even on HDD's where the physical sector size is really 512 bytes,
>> the way the sectors are laid out in a serpentine fashion, it is
>> *highly* likely that 4k write won't get torn.
>>
>> And while this is *possible*, it's also possible that some kind of I/O
>> transfer error --- such as some bit flips which breaks the checksum on
>> the commit block, but also trashes the tid of the subsequent block,
>> such that your patch gets tricked into thinking that this is the
>> partial last commit, when in fact it's not the last commit, thus
>> causing the journal replay abort early.  If that's case, it's much
>> safer to force fsck to be run to detect any inconsistency that might
>> result.
> Yeah, I agree in these cases of a corrupted journal it seems dangerous to
> just try to continue without fsck based on some heuristics.
>
> 								Honza
Jan Kara April 11, 2024, 1:37 p.m. UTC | #5
On Sun 07-04-24 09:37:25, yebin (H) wrote:
> On 2024/4/3 18:11, Jan Kara wrote:
> > On Tue 02-04-24 23:37:42, Theodore Ts'o wrote:
> > > On Tue, Apr 02, 2024 at 03:42:40PM +0200, Jan Kara wrote:
> > > > On Tue 02-04-24 17:09:51, Ye Bin wrote:
> > > > > We encountered a problem that the file system could not be mounted in
> > > > > the power-off scenario. The analysis of the file system mirror shows that
> > > > > only part of the data is written to the last commit block.
> > > > > To solve above issue, if commit block checksum is incorrect, check the next
> > > > > block if has valid magic and transaction ID. If next block hasn't valid
> > > > > magic or transaction ID then just drop the last transaction ignore checksum
> > > > > error. Theoretically, the transaction ID maybe occur loopback, which may cause
> > > > > the mounting failure.
> > > > > 
> > > > > Signed-off-by: Ye Bin <yebin10@huawei.com>
> > > > So this is curious. The commit block data is fully within one sector and
> > > > the expectation of the journaling is that either full sector or nothing is
> > > > written. So what kind of storage were you using that it breaks these
> > > > expectations?
> > > I suppose if the physical sector size is 512 bytes, and the file
> > > system block is 4k, I suppose it's possible that on a crash, that part
> > > of the 4k commit block could be written.
> > I was thinking about that as well but the commit block looks like:
> > 
> > truct commit_header {
> >          __be32          h_magic;
> >          __be32          h_blocktype;
> >          __be32          h_sequence;
> >          unsigned char   h_chksum_type;
> >          unsigned char   h_chksum_size;
> >          unsigned char   h_padding[2];
> >          __be32          h_chksum[JBD2_CHECKSUM_BYTES];
> >          __be64          h_commit_sec;
> >          __be32          h_commit_nsec;
> > };
> > 
> > where JBD2_CHECKSUM_BYTES is 8. So all the data in the commit block
> > including the checksum is in the first 60 bytes. Hence I would be really
> > surprised if some storage can tear that...
> This issue has been encountered a few times in the context of eMMC devices.
> The vendor
> has confirmed that only 512-byte atomicity can be ensured in the firmware.
> Although the valid data is only 60 bytes, the entire commit block is used
> for calculating
> the checksum.
> jbd2_commit_block_csum_verify:
> ...
> calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
> ...

Ah, indeed. This is the bit I've missed. Thanks for explanation! Still I
think trying to somehow automatically deal with wrong commit block checksum
is too dangerous because it can result in fs corruption in some (unlikely)
cases. OTOH I understand journal replay failure after a power fail isn't
great either so we need to think how to fix this...

								Honza
Theodore Ts'o April 11, 2024, 2:55 p.m. UTC | #6
On Thu, Apr 11, 2024 at 03:37:18PM +0200, Jan Kara wrote:
> > The vendor
> > has confirmed that only 512-byte atomicity can be ensured in the firmware.
> > Although the valid data is only 60 bytes, the entire commit block is used
> > for calculating
> > the checksum.
> > jbd2_commit_block_csum_verify:
> > ...
> > calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
> > ...
> 
> Ah, indeed. This is the bit I've missed. Thanks for explanation! Still I
> think trying to somehow automatically deal with wrong commit block checksum
> is too dangerous because it can result in fs corruption in some (unlikely)
> cases. OTOH I understand journal replay failure after a power fail isn't
> great either so we need to think how to fix this...

Unfortunately, the only fix I can think of would require changing how
we do the checksum to only include the portion of the jbd2 block which
contains valid data, per the header field.  This would be a format
change which means that if a new kernel writes the new jbd2 format
(using a journal incompat flag, or a new checksum type), older kernels
and older versions of e2fsprogs wouldn't be able to validate the
journal.  So rollout of the fix would have to be carefully managed.

					- Ted
yebin (H) April 12, 2024, 1:27 a.m. UTC | #7
On 2024/4/11 22:55, Theodore Ts'o wrote:
> On Thu, Apr 11, 2024 at 03:37:18PM +0200, Jan Kara wrote:
>>> The vendor
>>> has confirmed that only 512-byte atomicity can be ensured in the firmware.
>>> Although the valid data is only 60 bytes, the entire commit block is used
>>> for calculating
>>> the checksum.
>>> jbd2_commit_block_csum_verify:
>>> ...
>>> calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
>>> ...
>> Ah, indeed. This is the bit I've missed. Thanks for explanation! Still I
>> think trying to somehow automatically deal with wrong commit block checksum
>> is too dangerous because it can result in fs corruption in some (unlikely)
>> cases. OTOH I understand journal replay failure after a power fail isn't
>> great either so we need to think how to fix this...
> Unfortunately, the only fix I can think of would require changing how
> we do the checksum to only include the portion of the jbd2 block which
> contains valid data, per the header field.  This would be a format
> change which means that if a new kernel writes the new jbd2 format
> (using a journal incompat flag, or a new checksum type), older kernels
> and older versions of e2fsprogs wouldn't be able to validate the
> journal.  So rollout of the fix would have to be carefully managed.
>
> 					- Ted
> .
I thought of a solution: when the commit block checksum is incorrect, retain
the first 512 bytes of data, clear the subsequent data, and then recalculate
the checksum to see if it is correct. This solution can distinguish whether
the commit is complete for components that can ensure atomicity of 512 bytes
or more. For HDDs it may not be able to distinguish, but it should alleviate
the problem to some extent.
Theodore Ts'o April 12, 2024, 3:55 a.m. UTC | #8
On Fri, Apr 12, 2024 at 09:27:55AM +0800, yebin (H) wrote:
> I thought of a solution that when the commit block checksum is
> incorrect, retain the first 512 bytes of data, clear the subsequent
> data, and then calculate the checksum to see if it is correct. This
> solution can distinguish whether the commit is complete for
> components that can ensure the atomicity of 512 bytes or more. But
> for HDD, it may not be able to distinguish, but it should be
> alleviated to some extent.

Yeah, we discussed something similar at the weekly ext4 call; the idea
was to change the kernel to zero out the jbd2 block before we fill in
any jbd2 tags (including in the commit block) when writing the
journal.  Then in the journal replay path, if the checksum doesn't
match, we can try zeroing out everything beyond the size in the header
struct, and then retry the checksum and see if it matches.

This also has the benefit of making sure that we aren't leaking stale
(uninitialized) kernel memory to disk, which could be considered a
security vulnerability in some cases --- although the likelihood that
something truly sensitive could be leaked is quite low; the attack
requires raw access to the storage device; and exposure similar to
what gets written to the swap device.  Still there are people who do
worry about such things.

						- Ted
diff mbox series

Patch

diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 1f7664984d6e..0a09f1a5fd1e 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -463,6 +463,41 @@  static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
 		return tag->t_checksum == cpu_to_be16(csum32);
 }
 
+static int check_incomplete_commit(journal_t *journal, unsigned long next_block,
+				   unsigned int next_commit_ID)
+{
+	journal_header_t *tmp;
+	struct buffer_head *bh;
+	int err = 0;
+
+	err = jread(&bh, journal, next_block);
+	if (err)
+		return err;
+
+	tmp = (journal_header_t *)bh->b_data;
+	/*
+	 * If the next block does not contain consecutive transactions, it can
+	 * be considered that the checksum error of the current commit block
+	 * is caused by incomplete commit. Ignore the checksum error and drop
+	 * the last transaction.
+	 */
+	if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) ||
+	    be32_to_cpu(tmp->h_sequence) != next_commit_ID) {
+		jbd2_debug("JBD2: will drop incomplete transaction %u commit block %lu\n",
+			   next_commit_ID - 1, next_block - 1);
+		goto out;
+	}
+
+	pr_err("JBD2: potential continuous transaction detected %u at %lu, "
+	       "likely invalid checksum in transaction %u\n",
+	       next_commit_ID,  next_block, next_commit_ID - 1);
+
+	err = -EFSBADCRC;
+out:
+	brelse(bh);
+	return err;
+}
+
 static int do_one_pass(journal_t *journal,
 			struct recovery_info *info, enum passtype pass)
 {
@@ -810,6 +845,10 @@  static int do_one_pass(journal_t *journal,
 			if (pass == PASS_SCAN &&
 			    !jbd2_commit_block_csum_verify(journal,
 							   bh->b_data)) {
+				if (!check_incomplete_commit(journal,
+							     next_log_block,
+							     next_commit_ID + 1))
+					goto ignore_crc_mismatch;
 			chksum_error:
 				if (commit_time < last_trans_commit_time)
 					goto ignore_crc_mismatch;