Patchwork e2fsck: fix 64-bit journal support

login
register
mail settings
Submitter Theodore Ts'o
Date May 22, 2012, 1:42 a.m.
Message ID <E1SWe75-0007rd-TQ@tytso-glaptop.cam.corp.google.com>
Download mbox | patch
Permalink /patch/160521/
State Accepted
Headers show

Comments

Theodore Ts'o - May 22, 2012, 1:42 a.m.
I just found what a somewhat disturbing bug which is still present in
1.42.3 --- namely, that the journal replay code in e2fsck was not
properly handling 64-bit file systems correctly, by dropping the high
bits of the block number from the jbd2 descriptor blocks.

I have not had a chance to fully test to make sure we don't have other
bugs hiding in the 64-bit code, but it's clear no one else has had a lot
of time to test it either -- at least not to the extent of doing power
fail testing of > 16TB file systems!   :-(

							- Ted

From 3b693d0b03569795d04920a04a0a21e5f64ffedc Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 21 May 2012 21:30:45 -0400
Subject: [PATCH] e2fsck: fix 64-bit journal support

64-bit journal support was broken; we weren't using the high bits from
the journal descriptor blocks!  We were also using "unsigned long" for
the journal block numbers, which would be a problem on 32-bit systems.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 e2fsck/jfs_user.h |    4 ++--
 e2fsck/journal.c  |   33 +++++++++++++++++----------------
 e2fsck/recovery.c |   25 ++++++++++++-------------
 3 files changed, 31 insertions(+), 31 deletions(-)
Eric Sandeen - May 22, 2012, 3:24 p.m.
On 5/21/12 8:42 PM, Theodore Ts'o wrote:
> I just found what a somewhat disturbing bug which is still present in
> 1.42.3 --- namely, that the journal replay code in e2fsck was not
> properly handling 64-bit file systems correctly, by dropping the high
> bits of the block number from the jbd2 descriptor blocks.
> 
> I have not had a chance to fully test to make sure we don't have other
> bugs hiding in the 64-bit code, but it's clear no one else has had a lot
> of time to test it either -- at least not to the extent of doing power
> fail testing of > 16TB file systems!   :-(

Whoops.  :(

How do you decide between using "unsigned long long" and "blk64_t" below?

If they'd all been blk_t's they'd have been easier to spot, if anyone went
looking, I suppose.

-Eric

> 							- Ted
> 
> From 3b693d0b03569795d04920a04a0a21e5f64ffedc Mon Sep 17 00:00:00 2001
> From: Theodore Ts'o <tytso@mit.edu>
> Date: Mon, 21 May 2012 21:30:45 -0400
> Subject: [PATCH] e2fsck: fix 64-bit journal support
> 
> 64-bit journal support was broken; we weren't using the high bits from
> the journal descriptor blocks!  We were also using "unsigned long" for
> the journal block numbers, which would be a problem on 32-bit systems.
> 
> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
> ---
>  e2fsck/jfs_user.h |    4 ++--
>  e2fsck/journal.c  |   33 +++++++++++++++++----------------
>  e2fsck/recovery.c |   25 ++++++++++++-------------
>  3 files changed, 31 insertions(+), 31 deletions(-)
> 
> diff --git a/e2fsck/jfs_user.h b/e2fsck/jfs_user.h
> index 9e33306..92f8ae2 100644
> --- a/e2fsck/jfs_user.h
> +++ b/e2fsck/jfs_user.h
> @@ -18,7 +18,7 @@ struct buffer_head {
>  	e2fsck_t	b_ctx;
>  	io_channel 	b_io;
>  	int	 	b_size;
> -	blk_t	 	b_blocknr;
> +	unsigned long long b_blocknr;
>  	int	 	b_dirty;
>  	int	 	b_uptodate;
>  	int	 	b_err;
> @@ -121,7 +121,7 @@ _INLINE_ size_t journal_tag_bytes(journal_t *journal)
>  /*
>   * Kernel compatibility functions are defined in journal.c
>   */
> -int journal_bmap(journal_t *journal, blk64_t block, unsigned long *phys);
> +int journal_bmap(journal_t *journal, blk64_t block, unsigned long long *phys);
>  struct buffer_head *getblk(kdev_t ctx, blk64_t blocknr, int blocksize);
>  void sync_blockdev(kdev_t kdev);
>  void ll_rw_block(int rw, int dummy, struct buffer_head *bh[]);
> diff --git a/e2fsck/journal.c b/e2fsck/journal.c
> index 915b8bb..bada028 100644
> --- a/e2fsck/journal.c
> +++ b/e2fsck/journal.c
> @@ -44,7 +44,7 @@ static int bh_count = 0;
>   * to use the recovery.c file virtually unchanged from the kernel, so we
>   * don't have to do much to keep kernel and user recovery in sync.
>   */
> -int journal_bmap(journal_t *journal, blk64_t block, unsigned long *phys)
> +int journal_bmap(journal_t *journal, blk64_t block, unsigned long long *phys)
>  {
>  #ifdef USE_INODE_IO
>  	*phys = block;
> @@ -80,8 +80,8 @@ struct buffer_head *getblk(kdev_t kdev, blk64_t blocknr, int blocksize)
>  	if (journal_enable_debug >= 3)
>  		bh_count++;
>  #endif
> -	jfs_debug(4, "getblk for block %lu (%d bytes)(total %d)\n",
> -		  (unsigned long) blocknr, blocksize, bh_count);
> +	jfs_debug(4, "getblk for block %llu (%d bytes)(total %d)\n",
> +		  (unsigned long long) blocknr, blocksize, bh_count);
>  
>  	bh->b_ctx = kdev->k_ctx;
>  	if (kdev->k_dev == K_DEV_FS)
> @@ -114,38 +114,39 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhp[])
>  	for (; nr > 0; --nr) {
>  		bh = *bhp++;
>  		if (rw == READ && !bh->b_uptodate) {
> -			jfs_debug(3, "reading block %lu/%p\n",
> -				  (unsigned long) bh->b_blocknr, (void *) bh);
> +			jfs_debug(3, "reading block %llu/%p\n",
> +				  bh->b_blocknr, (void *) bh);
>  			retval = io_channel_read_blk64(bh->b_io,
>  						     bh->b_blocknr,
>  						     1, bh->b_data);
>  			if (retval) {
>  				com_err(bh->b_ctx->device_name, retval,
> -					"while reading block %lu\n",
> -					(unsigned long) bh->b_blocknr);
> +					"while reading block %llu\n",
> +					bh->b_blocknr);
>  				bh->b_err = retval;
>  				continue;
>  			}
>  			bh->b_uptodate = 1;
>  		} else if (rw == WRITE && bh->b_dirty) {
> -			jfs_debug(3, "writing block %lu/%p\n",
> -				  (unsigned long) bh->b_blocknr, (void *) bh);
> +			jfs_debug(3, "writing block %llu/%p\n",
> +				  bh->b_blocknr,
> +				  (void *) bh);
>  			retval = io_channel_write_blk64(bh->b_io,
>  						      bh->b_blocknr,
>  						      1, bh->b_data);
>  			if (retval) {
>  				com_err(bh->b_ctx->device_name, retval,
> -					"while writing block %lu\n",
> -					(unsigned long) bh->b_blocknr);
> +					"while writing block %llu\n",
> +					bh->b_blocknr);
>  				bh->b_err = retval;
>  				continue;
>  			}
>  			bh->b_dirty = 0;
>  			bh->b_uptodate = 1;
>  		} else {
> -			jfs_debug(3, "no-op %s for block %lu\n",
> +			jfs_debug(3, "no-op %s for block %llu\n",
>  				  rw == READ ? "read" : "write",
> -				  (unsigned long) bh->b_blocknr);
> +				  bh->b_blocknr);
>  		}
>  	}
>  }
> @@ -164,8 +165,8 @@ void brelse(struct buffer_head *bh)
>  {
>  	if (bh->b_dirty)
>  		ll_rw_block(WRITE, 1, &bh);
> -	jfs_debug(3, "freeing block %lu/%p (total %d)\n",
> -		  (unsigned long) bh->b_blocknr, (void *) bh, --bh_count);
> +	jfs_debug(3, "freeing block %llu/%p (total %d)\n",
> +		  bh->b_blocknr, (void *) bh, --bh_count);
>  	ext2fs_free_mem(&bh);
>  }
>  
> @@ -237,7 +238,7 @@ static errcode_t e2fsck_get_journal(e2fsck_t ctx, journal_t **ret_journal)
>  	journal_t		*journal = NULL;
>  	errcode_t		retval = 0;
>  	io_manager		io_ptr = 0;
> -	unsigned long		start = 0;
> +	unsigned long long	start = 0;
>  	int			ext_journal = 0;
>  	int			tried_backup_jnl = 0;
>  
> diff --git a/e2fsck/recovery.c b/e2fsck/recovery.c
> index b669941..e94ef4e 100644
> --- a/e2fsck/recovery.c
> +++ b/e2fsck/recovery.c
> @@ -71,7 +71,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
>  {
>  	int err;
>  	unsigned int max, nbufs, next;
> -	unsigned long blocknr;
> +	unsigned long long blocknr;
>  	struct buffer_head *bh;
>  
>  	struct buffer_head * bufs[MAXBUF];
> @@ -133,7 +133,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
>  		 unsigned int offset)
>  {
>  	int err;
> -	unsigned long blocknr;
> +	unsigned long long blocknr;
>  	struct buffer_head *bh;
>  
>  	*bhp = NULL;
> @@ -309,7 +309,6 @@ int journal_skip_recovery(journal_t *journal)
>  	return err;
>  }
>  
> -#if 0
>  static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag_t *tag)
>  {
>  	unsigned long long block = be32_to_cpu(tag->t_blocknr);
> @@ -317,17 +316,16 @@ static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag
>  		block |= (__u64)be32_to_cpu(tag->t_blocknr_high) << 32;
>  	return block;
>  }
> -#endif
>  
>  /*
>   * calc_chksums calculates the checksums for the blocks described in the
>   * descriptor block.
>   */
>  static int calc_chksums(journal_t *journal, struct buffer_head *bh,
> -			unsigned long *next_log_block, __u32 *crc32_sum)
> +			unsigned long long *next_log_block, __u32 *crc32_sum)
>  {
>  	int i, num_blks, err;
> -	unsigned long io_block;
> +	unsigned long long io_block;
>  	struct buffer_head *obh;
>  
>  	num_blks = count_tags(journal, bh);
> @@ -340,7 +338,7 @@ static int calc_chksums(journal_t *journal, struct buffer_head *bh,
>  		err = jread(&obh, journal, io_block);
>  		if (err) {
>  			printk(KERN_ERR "JBD: IO error %d recovering block "
> -				"%lu in log\n", err, io_block);
> +				"%llu in log\n", err, io_block);
>  			return 1;
>  		} else {
>  			*crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data,
> @@ -355,7 +353,7 @@ static int do_one_pass(journal_t *journal,
>  			struct recovery_info *info, enum passtype pass)
>  {
>  	unsigned int		first_commit_ID, next_commit_ID;
> -	unsigned long		next_log_block;
> +	unsigned long long	next_log_block;
>  	int			err, success = 0;
>  	journal_superblock_t *	sb;
>  	journal_header_t *	tmp;
> @@ -485,7 +483,7 @@ static int do_one_pass(journal_t *journal,
>  			tagp = &bh->b_data[sizeof(journal_header_t)];
>  			while ((tagp - bh->b_data + tag_bytes)
>  			       <= journal->j_blocksize) {
> -				unsigned long io_block;
> +				unsigned long long io_block;
>  
>  				tag = (journal_block_tag_t *) tagp;
>  				flags = be32_to_cpu(tag->t_flags);
> @@ -499,13 +497,14 @@ static int do_one_pass(journal_t *journal,
>  					success = err;
>  					printk (KERN_ERR
>  						"JBD: IO error %d recovering "
> -						"block %ld in log\n",
> +						"block %llu in log\n",
>  						err, io_block);
>  				} else {
> -					unsigned long blocknr;
> +					unsigned long long blocknr;
>  
>  					J_ASSERT(obh != NULL);
> -					blocknr = be32_to_cpu(tag->t_blocknr);
> +					blocknr = read_tag_block(tag_bytes,
> +								 tag);
>  
>  					/* If the block has been
>  					 * revoked, then we're all done
> @@ -733,7 +732,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
>  		record_len = 8;
>  
>  	while (offset < max) {
> -		unsigned long blocknr;
> +		unsigned long long blocknr;
>  		int err;
>  
>  		if (record_len == 4)

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Theodore Ts'o - May 22, 2012, 5:01 p.m.
On Tue, May 22, 2012 at 10:24:59AM -0500, Eric Sandeen wrote:
> 
> How do you decide between using "unsigned long long" and "blk64_t" below?
> 

Consistency more than anything else.  At least at one point
e2fsck/recovery.c was supposed to be identical to the kernel's
fs/jbd2/recovery.c, and so that meant we tried to use avoid using both
kernel-specific idioms that would be hard to replicate in userspace
(but we do quite a bit of that; see e2fsck/jfs_user.h) as well as
userspace-specific idioms/typedefs that don't work well in the kernel
context.  Over time the two have drifted apart, and a good future
project would be to try to bring them closer together so that bugs in
one are more likely to be correctly fixed in the other.

The other headache is with %llu in printf strings.  One of the
problems with using blk64_t is that we don't know whether it's an
unsigned int or an unsigned long or an unsigned long long, and that
causes all sorts of warnings with respect to printf format strings.
We *can* fix it via a cast to an unsigned long long and using %llu,
but it makes the code quite ugly.

So it's a bit simpler to use %llu and just use unsigned long long,
since we don't support any platform where long long is 128 bits, and
on pretty much all Linux systems, long long is pretty consistently 64
bits.  I wouldn't do this on anything that was an on-disk
representation, of course, but in this case I figured it was easier
just to use an "unsigned long long" in the struct buffer_head in
jfs_user.h, and then I made everything consistent with that.

	    	       	    	       		  - Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andreas Dilger - May 22, 2012, 5:09 p.m.
On 2012-05-22, at 9:24 AM, Eric Sandeen wrote:
> On 5/21/12 8:42 PM, Theodore Ts'o wrote:
>> I just found what a somewhat disturbing bug which is still present in
>> 1.42.3 --- namely, that the journal replay code in e2fsck was not
>> properly handling 64-bit file systems correctly, by dropping the high
>> bits of the block number from the jbd2 descriptor blocks.

Hmm, we did have a bug report that would be explained by this, but it wasn't easily reproduced (dependent on metadata blocks beyond 16TB in
hindsight) and only on a test filesystem, so I hadn't had a chance to
dig into it yet.

We did do a 128TB full-filesystem data write + verification + e2fsck,
but I guess that didn't validate journal recovery.  Most filesystems
only have a fraction of data beyond 16TB, and the allocators prefer
to locate inodes/blocks at the beginning of the filesystem, so it
hasn't shown up in real usage very much.

Good catch.

>> I have not had a chance to fully test to make sure we don't have other
>> bugs hiding in the 64-bit code, but it's clear no one else has had a
>> lot of time to test it either -- at least not to the extent of doing
>> power fail testing of > 16TB file systems!   :-(
> 
> Whoops.  :(
> 
> How do you decide between using "unsigned long long" and "blk64_t" below?
> 
> If they'd all been blk_t's they'd have been easier to spot, if anyone went looking, I suppose.

Makes sense.  Better to be consistent with these things, so that the
compiler and users (grep, etc) can find type mismatches and such.

>> 							- Ted
>> 
>> From 3b693d0b03569795d04920a04a0a21e5f64ffedc Mon Sep 17 00:00:00 2001
>> From: Theodore Ts'o <tytso@mit.edu>
>> Date: Mon, 21 May 2012 21:30:45 -0400
>> Subject: [PATCH] e2fsck: fix 64-bit journal support
>> 
>> 64-bit journal support was broken; we weren't using the high bits from
>> the journal descriptor blocks!  We were also using "unsigned long" for
>> the journal block numbers, which would be a problem on 32-bit systems.
>> 
>> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
>> ---
>> e2fsck/jfs_user.h |    4 ++--
>> e2fsck/journal.c  |   33 +++++++++++++++++----------------
>> e2fsck/recovery.c |   25 ++++++++++++-------------
>> 3 files changed, 31 insertions(+), 31 deletions(-)
>> 
>> diff --git a/e2fsck/jfs_user.h b/e2fsck/jfs_user.h
>> index 9e33306..92f8ae2 100644
>> --- a/e2fsck/jfs_user.h
>> +++ b/e2fsck/jfs_user.h
>> @@ -18,7 +18,7 @@ struct buffer_head {
>> 	e2fsck_t	b_ctx;
>> 	io_channel 	b_io;
>> 	int	 	b_size;
>> -	blk_t	 	b_blocknr;
>> +	unsigned long long b_blocknr;
>> 	int	 	b_dirty;
>> 	int	 	b_uptodate;
>> 	int	 	b_err;
>> @@ -121,7 +121,7 @@ _INLINE_ size_t journal_tag_bytes(journal_t *journal)
>> /*
>>  * Kernel compatibility functions are defined in journal.c
>>  */
>> -int journal_bmap(journal_t *journal, blk64_t block, unsigned long *phys);
>> +int journal_bmap(journal_t *journal, blk64_t block, unsigned long long *phys);
>> struct buffer_head *getblk(kdev_t ctx, blk64_t blocknr, int blocksize);
>> void sync_blockdev(kdev_t kdev);
>> void ll_rw_block(int rw, int dummy, struct buffer_head *bh[]);
>> diff --git a/e2fsck/journal.c b/e2fsck/journal.c
>> index 915b8bb..bada028 100644
>> --- a/e2fsck/journal.c
>> +++ b/e2fsck/journal.c
>> @@ -44,7 +44,7 @@ static int bh_count = 0;
>>  * to use the recovery.c file virtually unchanged from the kernel, so we
>>  * don't have to do much to keep kernel and user recovery in sync.
>>  */
>> -int journal_bmap(journal_t *journal, blk64_t block, unsigned long *phys)
>> +int journal_bmap(journal_t *journal, blk64_t block, unsigned long long *phys)
>> {
>> #ifdef USE_INODE_IO
>> 	*phys = block;
>> @@ -80,8 +80,8 @@ struct buffer_head *getblk(kdev_t kdev, blk64_t blocknr, int blocksize)
>> 	if (journal_enable_debug >= 3)
>> 		bh_count++;
>> #endif
>> -	jfs_debug(4, "getblk for block %lu (%d bytes)(total %d)\n",
>> -		  (unsigned long) blocknr, blocksize, bh_count);
>> +	jfs_debug(4, "getblk for block %llu (%d bytes)(total %d)\n",
>> +		  (unsigned long long) blocknr, blocksize, bh_count);
>> 
>> 	bh->b_ctx = kdev->k_ctx;
>> 	if (kdev->k_dev == K_DEV_FS)
>> @@ -114,38 +114,39 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhp[])
>> 	for (; nr > 0; --nr) {
>> 		bh = *bhp++;
>> 		if (rw == READ && !bh->b_uptodate) {
>> -			jfs_debug(3, "reading block %lu/%p\n",
>> -				  (unsigned long) bh->b_blocknr, (void *) bh);
>> +			jfs_debug(3, "reading block %llu/%p\n",
>> +				  bh->b_blocknr, (void *) bh);
>> 			retval = io_channel_read_blk64(bh->b_io,
>> 						     bh->b_blocknr,
>> 						     1, bh->b_data);
>> 			if (retval) {
>> 				com_err(bh->b_ctx->device_name, retval,
>> -					"while reading block %lu\n",
>> -					(unsigned long) bh->b_blocknr);
>> +					"while reading block %llu\n",
>> +					bh->b_blocknr);
>> 				bh->b_err = retval;
>> 				continue;
>> 			}
>> 			bh->b_uptodate = 1;
>> 		} else if (rw == WRITE && bh->b_dirty) {
>> -			jfs_debug(3, "writing block %lu/%p\n",
>> -				  (unsigned long) bh->b_blocknr, (void *) bh);
>> +			jfs_debug(3, "writing block %llu/%p\n",
>> +				  bh->b_blocknr,
>> +				  (void *) bh);
>> 			retval = io_channel_write_blk64(bh->b_io,
>> 						      bh->b_blocknr,
>> 						      1, bh->b_data);
>> 			if (retval) {
>> 				com_err(bh->b_ctx->device_name, retval,
>> -					"while writing block %lu\n",
>> -					(unsigned long) bh->b_blocknr);
>> +					"while writing block %llu\n",
>> +					bh->b_blocknr);
>> 				bh->b_err = retval;
>> 				continue;
>> 			}
>> 			bh->b_dirty = 0;
>> 			bh->b_uptodate = 1;
>> 		} else {
>> -			jfs_debug(3, "no-op %s for block %lu\n",
>> +			jfs_debug(3, "no-op %s for block %llu\n",
>> 				  rw == READ ? "read" : "write",
>> -				  (unsigned long) bh->b_blocknr);
>> +				  bh->b_blocknr);
>> 		}
>> 	}
>> }
>> @@ -164,8 +165,8 @@ void brelse(struct buffer_head *bh)
>> {
>> 	if (bh->b_dirty)
>> 		ll_rw_block(WRITE, 1, &bh);
>> -	jfs_debug(3, "freeing block %lu/%p (total %d)\n",
>> -		  (unsigned long) bh->b_blocknr, (void *) bh, --bh_count);
>> +	jfs_debug(3, "freeing block %llu/%p (total %d)\n",
>> +		  bh->b_blocknr, (void *) bh, --bh_count);
>> 	ext2fs_free_mem(&bh);
>> }
>> 
>> @@ -237,7 +238,7 @@ static errcode_t e2fsck_get_journal(e2fsck_t ctx, journal_t **ret_journal)
>> 	journal_t		*journal = NULL;
>> 	errcode_t		retval = 0;
>> 	io_manager		io_ptr = 0;
>> -	unsigned long		start = 0;
>> +	unsigned long long	start = 0;
>> 	int			ext_journal = 0;
>> 	int			tried_backup_jnl = 0;
>> 
>> diff --git a/e2fsck/recovery.c b/e2fsck/recovery.c
>> index b669941..e94ef4e 100644
>> --- a/e2fsck/recovery.c
>> +++ b/e2fsck/recovery.c
>> @@ -71,7 +71,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
>> {
>> 	int err;
>> 	unsigned int max, nbufs, next;
>> -	unsigned long blocknr;
>> +	unsigned long long blocknr;
>> 	struct buffer_head *bh;
>> 
>> 	struct buffer_head * bufs[MAXBUF];
>> @@ -133,7 +133,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
>> 		 unsigned int offset)
>> {
>> 	int err;
>> -	unsigned long blocknr;
>> +	unsigned long long blocknr;
>> 	struct buffer_head *bh;
>> 
>> 	*bhp = NULL;
>> @@ -309,7 +309,6 @@ int journal_skip_recovery(journal_t *journal)
>> 	return err;
>> }
>> 
>> -#if 0
>> static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag_t *tag)
>> {
>> 	unsigned long long block = be32_to_cpu(tag->t_blocknr);
>> @@ -317,17 +316,16 @@ static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag
>> 		block |= (__u64)be32_to_cpu(tag->t_blocknr_high) << 32;
>> 	return block;
>> }
>> -#endif
>> 
>> /*
>>  * calc_chksums calculates the checksums for the blocks described in the
>>  * descriptor block.
>>  */
>> static int calc_chksums(journal_t *journal, struct buffer_head *bh,
>> -			unsigned long *next_log_block, __u32 *crc32_sum)
>> +			unsigned long long *next_log_block, __u32 *crc32_sum)
>> {
>> 	int i, num_blks, err;
>> -	unsigned long io_block;
>> +	unsigned long long io_block;
>> 	struct buffer_head *obh;
>> 
>> 	num_blks = count_tags(journal, bh);
>> @@ -340,7 +338,7 @@ static int calc_chksums(journal_t *journal, struct buffer_head *bh,
>> 		err = jread(&obh, journal, io_block);
>> 		if (err) {
>> 			printk(KERN_ERR "JBD: IO error %d recovering block "
>> -				"%lu in log\n", err, io_block);
>> +				"%llu in log\n", err, io_block);
>> 			return 1;
>> 		} else {
>> 			*crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data,
>> @@ -355,7 +353,7 @@ static int do_one_pass(journal_t *journal,
>> 			struct recovery_info *info, enum passtype pass)
>> {
>> 	unsigned int		first_commit_ID, next_commit_ID;
>> -	unsigned long		next_log_block;
>> +	unsigned long long	next_log_block;
>> 	int			err, success = 0;
>> 	journal_superblock_t *	sb;
>> 	journal_header_t *	tmp;
>> @@ -485,7 +483,7 @@ static int do_one_pass(journal_t *journal,
>> 			tagp = &bh->b_data[sizeof(journal_header_t)];
>> 			while ((tagp - bh->b_data + tag_bytes)
>> 			       <= journal->j_blocksize) {
>> -				unsigned long io_block;
>> +				unsigned long long io_block;
>> 
>> 				tag = (journal_block_tag_t *) tagp;
>> 				flags = be32_to_cpu(tag->t_flags);
>> @@ -499,13 +497,14 @@ static int do_one_pass(journal_t *journal,
>> 					success = err;
>> 					printk (KERN_ERR
>> 						"JBD: IO error %d recovering "
>> -						"block %ld in log\n",
>> +						"block %llu in log\n",
>> 						err, io_block);
>> 				} else {
>> -					unsigned long blocknr;
>> +					unsigned long long blocknr;
>> 
>> 					J_ASSERT(obh != NULL);
>> -					blocknr = be32_to_cpu(tag->t_blocknr);
>> +					blocknr = read_tag_block(tag_bytes,
>> +								 tag);
>> 
>> 					/* If the block has been
>> 					 * revoked, then we're all done
>> @@ -733,7 +732,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
>> 		record_len = 8;
>> 
>> 	while (offset < max) {
>> -		unsigned long blocknr;
>> +		unsigned long long blocknr;
>> 		int err;
>> 
>> 		if (record_len == 4)
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


Cheers, Andreas





--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andreas Dilger - May 24, 2012, 1 a.m.
Ted, were you going to push this patch?  I didn't see it in the maint branch where the other post-1.42.3 patches were. 

Cheers, Andreas

On 2012-05-21, at 19:42, "Theodore Ts'o" <tytso@mit.edu> wrote:

> 
> I just found what a somewhat disturbing bug which is still present in
> 1.42.3 --- namely, that the journal replay code in e2fsck was not
> properly handling 64-bit file systems correctly, by dropping the high
> bits of the block number from the jbd2 descriptor blocks.
> 
> I have not had a chance to fully test to make sure we don't have other
> bugs hiding in the 64-bit code, but it's clear no one else has had a lot
> of time to test it either -- at least not to the extent of doing power
> fail testing of > 16TB file systems!   :-(
> 
>                            - Ted
> 
> From 3b693d0b03569795d04920a04a0a21e5f64ffedc Mon Sep 17 00:00:00 2001
> From: Theodore Ts'o <tytso@mit.edu>
> Date: Mon, 21 May 2012 21:30:45 -0400
> Subject: [PATCH] e2fsck: fix 64-bit journal support
> 
> 64-bit journal support was broken; we weren't using the high bits from
> the journal descriptor blocks!  We were also using "unsigned long" for
> the journal block numbers, which would be a problem on 32-bit systems.
> 
> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
> ---
> e2fsck/jfs_user.h |    4 ++--
> e2fsck/journal.c  |   33 +++++++++++++++++----------------
> e2fsck/recovery.c |   25 ++++++++++++-------------
> 3 files changed, 31 insertions(+), 31 deletions(-)
> 
> diff --git a/e2fsck/jfs_user.h b/e2fsck/jfs_user.h
> index 9e33306..92f8ae2 100644
> --- a/e2fsck/jfs_user.h
> +++ b/e2fsck/jfs_user.h
> @@ -18,7 +18,7 @@ struct buffer_head {
>    e2fsck_t    b_ctx;
>    io_channel    b_io;
>    int        b_size;
> -    blk_t        b_blocknr;
> +    unsigned long long b_blocknr;
>    int        b_dirty;
>    int        b_uptodate;
>    int        b_err;
> @@ -121,7 +121,7 @@ _INLINE_ size_t journal_tag_bytes(journal_t *journal)
> /*
>  * Kernel compatibility functions are defined in journal.c
>  */
> -int journal_bmap(journal_t *journal, blk64_t block, unsigned long *phys);
> +int journal_bmap(journal_t *journal, blk64_t block, unsigned long long *phys);
> struct buffer_head *getblk(kdev_t ctx, blk64_t blocknr, int blocksize);
> void sync_blockdev(kdev_t kdev);
> void ll_rw_block(int rw, int dummy, struct buffer_head *bh[]);
> diff --git a/e2fsck/journal.c b/e2fsck/journal.c
> index 915b8bb..bada028 100644
> --- a/e2fsck/journal.c
> +++ b/e2fsck/journal.c
> @@ -44,7 +44,7 @@ static int bh_count = 0;
>  * to use the recovery.c file virtually unchanged from the kernel, so we
>  * don't have to do much to keep kernel and user recovery in sync.
>  */
> -int journal_bmap(journal_t *journal, blk64_t block, unsigned long *phys)
> +int journal_bmap(journal_t *journal, blk64_t block, unsigned long long *phys)
> {
> #ifdef USE_INODE_IO
>    *phys = block;
> @@ -80,8 +80,8 @@ struct buffer_head *getblk(kdev_t kdev, blk64_t blocknr, int blocksize)
>    if (journal_enable_debug >= 3)
>        bh_count++;
> #endif
> -    jfs_debug(4, "getblk for block %lu (%d bytes)(total %d)\n",
> -          (unsigned long) blocknr, blocksize, bh_count);
> +    jfs_debug(4, "getblk for block %llu (%d bytes)(total %d)\n",
> +          (unsigned long long) blocknr, blocksize, bh_count);
> 
>    bh->b_ctx = kdev->k_ctx;
>    if (kdev->k_dev == K_DEV_FS)
> @@ -114,38 +114,39 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhp[])
>    for (; nr > 0; --nr) {
>        bh = *bhp++;
>        if (rw == READ && !bh->b_uptodate) {
> -            jfs_debug(3, "reading block %lu/%p\n",
> -                  (unsigned long) bh->b_blocknr, (void *) bh);
> +            jfs_debug(3, "reading block %llu/%p\n",
> +                  bh->b_blocknr, (void *) bh);
>            retval = io_channel_read_blk64(bh->b_io,
>                             bh->b_blocknr,
>                             1, bh->b_data);
>            if (retval) {
>                com_err(bh->b_ctx->device_name, retval,
> -                    "while reading block %lu\n",
> -                    (unsigned long) bh->b_blocknr);
> +                    "while reading block %llu\n",
> +                    bh->b_blocknr);
>                bh->b_err = retval;
>                continue;
>            }
>            bh->b_uptodate = 1;
>        } else if (rw == WRITE && bh->b_dirty) {
> -            jfs_debug(3, "writing block %lu/%p\n",
> -                  (unsigned long) bh->b_blocknr, (void *) bh);
> +            jfs_debug(3, "writing block %llu/%p\n",
> +                  bh->b_blocknr,
> +                  (void *) bh);
>            retval = io_channel_write_blk64(bh->b_io,
>                              bh->b_blocknr,
>                              1, bh->b_data);
>            if (retval) {
>                com_err(bh->b_ctx->device_name, retval,
> -                    "while writing block %lu\n",
> -                    (unsigned long) bh->b_blocknr);
> +                    "while writing block %llu\n",
> +                    bh->b_blocknr);
>                bh->b_err = retval;
>                continue;
>            }
>            bh->b_dirty = 0;
>            bh->b_uptodate = 1;
>        } else {
> -            jfs_debug(3, "no-op %s for block %lu\n",
> +            jfs_debug(3, "no-op %s for block %llu\n",
>                  rw == READ ? "read" : "write",
> -                  (unsigned long) bh->b_blocknr);
> +                  bh->b_blocknr);
>        }
>    }
> }
> @@ -164,8 +165,8 @@ void brelse(struct buffer_head *bh)
> {
>    if (bh->b_dirty)
>        ll_rw_block(WRITE, 1, &bh);
> -    jfs_debug(3, "freeing block %lu/%p (total %d)\n",
> -          (unsigned long) bh->b_blocknr, (void *) bh, --bh_count);
> +    jfs_debug(3, "freeing block %llu/%p (total %d)\n",
> +          bh->b_blocknr, (void *) bh, --bh_count);
>    ext2fs_free_mem(&bh);
> }
> 
> @@ -237,7 +238,7 @@ static errcode_t e2fsck_get_journal(e2fsck_t ctx, journal_t **ret_journal)
>    journal_t        *journal = NULL;
>    errcode_t        retval = 0;
>    io_manager        io_ptr = 0;
> -    unsigned long        start = 0;
> +    unsigned long long    start = 0;
>    int            ext_journal = 0;
>    int            tried_backup_jnl = 0;
> 
> diff --git a/e2fsck/recovery.c b/e2fsck/recovery.c
> index b669941..e94ef4e 100644
> --- a/e2fsck/recovery.c
> +++ b/e2fsck/recovery.c
> @@ -71,7 +71,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
> {
>    int err;
>    unsigned int max, nbufs, next;
> -    unsigned long blocknr;
> +    unsigned long long blocknr;
>    struct buffer_head *bh;
> 
>    struct buffer_head * bufs[MAXBUF];
> @@ -133,7 +133,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
>         unsigned int offset)
> {
>    int err;
> -    unsigned long blocknr;
> +    unsigned long long blocknr;
>    struct buffer_head *bh;
> 
>    *bhp = NULL;
> @@ -309,7 +309,6 @@ int journal_skip_recovery(journal_t *journal)
>    return err;
> }
> 
> -#if 0
> static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag_t *tag)
> {
>    unsigned long long block = be32_to_cpu(tag->t_blocknr);
> @@ -317,17 +316,16 @@ static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag
>        block |= (__u64)be32_to_cpu(tag->t_blocknr_high) << 32;
>    return block;
> }
> -#endif
> 
> /*
>  * calc_chksums calculates the checksums for the blocks described in the
>  * descriptor block.
>  */
> static int calc_chksums(journal_t *journal, struct buffer_head *bh,
> -            unsigned long *next_log_block, __u32 *crc32_sum)
> +            unsigned long long *next_log_block, __u32 *crc32_sum)
> {
>    int i, num_blks, err;
> -    unsigned long io_block;
> +    unsigned long long io_block;
>    struct buffer_head *obh;
> 
>    num_blks = count_tags(journal, bh);
> @@ -340,7 +338,7 @@ static int calc_chksums(journal_t *journal, struct buffer_head *bh,
>        err = jread(&obh, journal, io_block);
>        if (err) {
>            printk(KERN_ERR "JBD: IO error %d recovering block "
> -                "%lu in log\n", err, io_block);
> +                "%llu in log\n", err, io_block);
>            return 1;
>        } else {
>            *crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data,
> @@ -355,7 +353,7 @@ static int do_one_pass(journal_t *journal,
>            struct recovery_info *info, enum passtype pass)
> {
>    unsigned int        first_commit_ID, next_commit_ID;
> -    unsigned long        next_log_block;
> +    unsigned long long    next_log_block;
>    int            err, success = 0;
>    journal_superblock_t *    sb;
>    journal_header_t *    tmp;
> @@ -485,7 +483,7 @@ static int do_one_pass(journal_t *journal,
>            tagp = &bh->b_data[sizeof(journal_header_t)];
>            while ((tagp - bh->b_data + tag_bytes)
>                   <= journal->j_blocksize) {
> -                unsigned long io_block;
> +                unsigned long long io_block;
> 
>                tag = (journal_block_tag_t *) tagp;
>                flags = be32_to_cpu(tag->t_flags);
> @@ -499,13 +497,14 @@ static int do_one_pass(journal_t *journal,
>                    success = err;
>                    printk (KERN_ERR
>                        "JBD: IO error %d recovering "
> -                        "block %ld in log\n",
> +                        "block %llu in log\n",
>                        err, io_block);
>                } else {
> -                    unsigned long blocknr;
> +                    unsigned long long blocknr;
> 
>                    J_ASSERT(obh != NULL);
> -                    blocknr = be32_to_cpu(tag->t_blocknr);
> +                    blocknr = read_tag_block(tag_bytes,
> +                                 tag);
> 
>                    /* If the block has been
>                     * revoked, then we're all done
> @@ -733,7 +732,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
>        record_len = 8;
> 
>    while (offset < max) {
> -        unsigned long blocknr;
> +        unsigned long long blocknr;
>        int err;
> 
>        if (record_len == 4)
> -- 
> 1.7.10.rc3
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/e2fsck/jfs_user.h b/e2fsck/jfs_user.h
index 9e33306..92f8ae2 100644
--- a/e2fsck/jfs_user.h
+++ b/e2fsck/jfs_user.h
@@ -18,7 +18,7 @@  struct buffer_head {
 	e2fsck_t	b_ctx;
 	io_channel 	b_io;
 	int	 	b_size;
-	blk_t	 	b_blocknr;
+	unsigned long long b_blocknr;
 	int	 	b_dirty;
 	int	 	b_uptodate;
 	int	 	b_err;
@@ -121,7 +121,7 @@  _INLINE_ size_t journal_tag_bytes(journal_t *journal)
 /*
  * Kernel compatibility functions are defined in journal.c
  */
-int journal_bmap(journal_t *journal, blk64_t block, unsigned long *phys);
+int journal_bmap(journal_t *journal, blk64_t block, unsigned long long *phys);
 struct buffer_head *getblk(kdev_t ctx, blk64_t blocknr, int blocksize);
 void sync_blockdev(kdev_t kdev);
 void ll_rw_block(int rw, int dummy, struct buffer_head *bh[]);
diff --git a/e2fsck/journal.c b/e2fsck/journal.c
index 915b8bb..bada028 100644
--- a/e2fsck/journal.c
+++ b/e2fsck/journal.c
@@ -44,7 +44,7 @@  static int bh_count = 0;
  * to use the recovery.c file virtually unchanged from the kernel, so we
  * don't have to do much to keep kernel and user recovery in sync.
  */
-int journal_bmap(journal_t *journal, blk64_t block, unsigned long *phys)
+int journal_bmap(journal_t *journal, blk64_t block, unsigned long long *phys)
 {
 #ifdef USE_INODE_IO
 	*phys = block;
@@ -80,8 +80,8 @@  struct buffer_head *getblk(kdev_t kdev, blk64_t blocknr, int blocksize)
 	if (journal_enable_debug >= 3)
 		bh_count++;
 #endif
-	jfs_debug(4, "getblk for block %lu (%d bytes)(total %d)\n",
-		  (unsigned long) blocknr, blocksize, bh_count);
+	jfs_debug(4, "getblk for block %llu (%d bytes)(total %d)\n",
+		  (unsigned long long) blocknr, blocksize, bh_count);
 
 	bh->b_ctx = kdev->k_ctx;
 	if (kdev->k_dev == K_DEV_FS)
@@ -114,38 +114,39 @@  void ll_rw_block(int rw, int nr, struct buffer_head *bhp[])
 	for (; nr > 0; --nr) {
 		bh = *bhp++;
 		if (rw == READ && !bh->b_uptodate) {
-			jfs_debug(3, "reading block %lu/%p\n",
-				  (unsigned long) bh->b_blocknr, (void *) bh);
+			jfs_debug(3, "reading block %llu/%p\n",
+				  bh->b_blocknr, (void *) bh);
 			retval = io_channel_read_blk64(bh->b_io,
 						     bh->b_blocknr,
 						     1, bh->b_data);
 			if (retval) {
 				com_err(bh->b_ctx->device_name, retval,
-					"while reading block %lu\n",
-					(unsigned long) bh->b_blocknr);
+					"while reading block %llu\n",
+					bh->b_blocknr);
 				bh->b_err = retval;
 				continue;
 			}
 			bh->b_uptodate = 1;
 		} else if (rw == WRITE && bh->b_dirty) {
-			jfs_debug(3, "writing block %lu/%p\n",
-				  (unsigned long) bh->b_blocknr, (void *) bh);
+			jfs_debug(3, "writing block %llu/%p\n",
+				  bh->b_blocknr,
+				  (void *) bh);
 			retval = io_channel_write_blk64(bh->b_io,
 						      bh->b_blocknr,
 						      1, bh->b_data);
 			if (retval) {
 				com_err(bh->b_ctx->device_name, retval,
-					"while writing block %lu\n",
-					(unsigned long) bh->b_blocknr);
+					"while writing block %llu\n",
+					bh->b_blocknr);
 				bh->b_err = retval;
 				continue;
 			}
 			bh->b_dirty = 0;
 			bh->b_uptodate = 1;
 		} else {
-			jfs_debug(3, "no-op %s for block %lu\n",
+			jfs_debug(3, "no-op %s for block %llu\n",
 				  rw == READ ? "read" : "write",
-				  (unsigned long) bh->b_blocknr);
+				  bh->b_blocknr);
 		}
 	}
 }
@@ -164,8 +165,8 @@  void brelse(struct buffer_head *bh)
 {
 	if (bh->b_dirty)
 		ll_rw_block(WRITE, 1, &bh);
-	jfs_debug(3, "freeing block %lu/%p (total %d)\n",
-		  (unsigned long) bh->b_blocknr, (void *) bh, --bh_count);
+	jfs_debug(3, "freeing block %llu/%p (total %d)\n",
+		  bh->b_blocknr, (void *) bh, --bh_count);
 	ext2fs_free_mem(&bh);
 }
 
@@ -237,7 +238,7 @@  static errcode_t e2fsck_get_journal(e2fsck_t ctx, journal_t **ret_journal)
 	journal_t		*journal = NULL;
 	errcode_t		retval = 0;
 	io_manager		io_ptr = 0;
-	unsigned long		start = 0;
+	unsigned long long	start = 0;
 	int			ext_journal = 0;
 	int			tried_backup_jnl = 0;
 
diff --git a/e2fsck/recovery.c b/e2fsck/recovery.c
index b669941..e94ef4e 100644
--- a/e2fsck/recovery.c
+++ b/e2fsck/recovery.c
@@ -71,7 +71,7 @@  static int do_readahead(journal_t *journal, unsigned int start)
 {
 	int err;
 	unsigned int max, nbufs, next;
-	unsigned long blocknr;
+	unsigned long long blocknr;
 	struct buffer_head *bh;
 
 	struct buffer_head * bufs[MAXBUF];
@@ -133,7 +133,7 @@  static int jread(struct buffer_head **bhp, journal_t *journal,
 		 unsigned int offset)
 {
 	int err;
-	unsigned long blocknr;
+	unsigned long long blocknr;
 	struct buffer_head *bh;
 
 	*bhp = NULL;
@@ -309,7 +309,6 @@  int journal_skip_recovery(journal_t *journal)
 	return err;
 }
 
-#if 0
 static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag_t *tag)
 {
 	unsigned long long block = be32_to_cpu(tag->t_blocknr);
@@ -317,17 +316,16 @@  static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag
 		block |= (__u64)be32_to_cpu(tag->t_blocknr_high) << 32;
 	return block;
 }
-#endif
 
 /*
  * calc_chksums calculates the checksums for the blocks described in the
  * descriptor block.
  */
 static int calc_chksums(journal_t *journal, struct buffer_head *bh,
-			unsigned long *next_log_block, __u32 *crc32_sum)
+			unsigned long long *next_log_block, __u32 *crc32_sum)
 {
 	int i, num_blks, err;
-	unsigned long io_block;
+	unsigned long long io_block;
 	struct buffer_head *obh;
 
 	num_blks = count_tags(journal, bh);
@@ -340,7 +338,7 @@  static int calc_chksums(journal_t *journal, struct buffer_head *bh,
 		err = jread(&obh, journal, io_block);
 		if (err) {
 			printk(KERN_ERR "JBD: IO error %d recovering block "
-				"%lu in log\n", err, io_block);
+				"%llu in log\n", err, io_block);
 			return 1;
 		} else {
 			*crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data,
@@ -355,7 +353,7 @@  static int do_one_pass(journal_t *journal,
 			struct recovery_info *info, enum passtype pass)
 {
 	unsigned int		first_commit_ID, next_commit_ID;
-	unsigned long		next_log_block;
+	unsigned long long	next_log_block;
 	int			err, success = 0;
 	journal_superblock_t *	sb;
 	journal_header_t *	tmp;
@@ -485,7 +483,7 @@  static int do_one_pass(journal_t *journal,
 			tagp = &bh->b_data[sizeof(journal_header_t)];
 			while ((tagp - bh->b_data + tag_bytes)
 			       <= journal->j_blocksize) {
-				unsigned long io_block;
+				unsigned long long io_block;
 
 				tag = (journal_block_tag_t *) tagp;
 				flags = be32_to_cpu(tag->t_flags);
@@ -499,13 +497,14 @@  static int do_one_pass(journal_t *journal,
 					success = err;
 					printk (KERN_ERR
 						"JBD: IO error %d recovering "
-						"block %ld in log\n",
+						"block %llu in log\n",
 						err, io_block);
 				} else {
-					unsigned long blocknr;
+					unsigned long long blocknr;
 
 					J_ASSERT(obh != NULL);
-					blocknr = be32_to_cpu(tag->t_blocknr);
+					blocknr = read_tag_block(tag_bytes,
+								 tag);
 
 					/* If the block has been
 					 * revoked, then we're all done
@@ -733,7 +732,7 @@  static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
 		record_len = 8;
 
 	while (offset < max) {
-		unsigned long blocknr;
+		unsigned long long blocknr;
 		int err;
 
 		if (record_len == 4)