diff mbox series

[6/7] e2fsck: main fast commit replay handler

Message ID 20200319233433.117144-7-harshadshirwadkar@gmail.com
State New
Headers show
Series e2fsck: fast commit recovery path e2fsck changes | expand

Commit Message

harshad shirwadkar March 19, 2020, 11:34 p.m. UTC
Add main ext4 fast commit replay handler that handles replayed fast
commit blocks.

Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
---
 e2fsck/e2fsck.h      |   9 +
 e2fsck/journal.c     | 491 ++++++++++++++++++++++++++++++++++++++++++-
 lib/ext2fs/ext2_fs.h |  46 ++++
 3 files changed, 545 insertions(+), 1 deletion(-)
diff mbox series

Patch

diff --git a/e2fsck/e2fsck.h b/e2fsck/e2fsck.h
index 68f7a249..8ea87ac5 100644
--- a/e2fsck/e2fsck.h
+++ b/e2fsck/e2fsck.h
@@ -226,6 +226,12 @@  typedef struct e2fsck_struct *e2fsck_t;
 
 #define MAX_EXTENT_DEPTH_COUNT 5
 
+struct e2fsck_fc_replay_state {
+	int fc_replay_error;
+	int fc_replay_expected_off;
+	int fc_num_blks;
+};
+
 struct e2fsck_struct {
 	ext2_filsys fs;
 	const char *program_name;
@@ -418,6 +424,9 @@  struct e2fsck_struct {
 
 	/* Undo file */
 	char *undo_file;
+
+	/* Fast commit replay stuff */
+	struct e2fsck_fc_replay_state fc_replay_state;
 };
 
 /* Data structures to evaluate whether an extent tree needs rebuilding. */
diff --git a/e2fsck/journal.c b/e2fsck/journal.c
index 7d9f1b40..97fb3c24 100644
--- a/e2fsck/journal.c
+++ b/e2fsck/journal.c
@@ -278,6 +278,485 @@  static int process_journal_block(ext2_filsys fs,
 	return 0;
 }
 
+static int ext4_journal_fc_replay_scan(journal_t *j, struct buffer_head *bh,
+				       int off)
+{
+	e2fsck_t ctx = j->j_fs_dev->k_ctx;
+	struct e2fsck_fc_replay_state *state;
+	struct ext4_fc_commit_hdr *fc_hdr;
+	struct ext4_fc_tl *tl;
+	__u32 csum, old_csum;
+	__u8 *start, *end;
+
+	state = &ctx->fc_replay_state;
+	fc_hdr = (struct ext4_fc_commit_hdr *)
+		  ((__u8 *)bh->b_data + sizeof(journal_header_t));
+
+	start = (__u8 *)fc_hdr;
+	end = (__u8 *)bh->b_data + j->j_blocksize;
+
+	/* Check if we already concluded that this fast commit is not useful */
+	if (state->fc_replay_expected_off && state->fc_replay_error)
+		goto out_err;
+
+	if (le32_to_cpu(fc_hdr->fc_magic) != EXT4_FC_MAGIC) {
+		state->fc_replay_error = -EXT2_ET_BAD_MAGIC;
+		goto out_err;
+	}
+
+	if (off != state->fc_replay_expected_off) {
+		state->fc_replay_error = -EXT2_ET_CORRUPT_JOURNAL_SB;
+		goto out_err;
+	}
+
+	state->fc_replay_expected_off++;
+
+	if (le16_to_cpu(fc_hdr->fc_features)) {
+		state->fc_replay_error = -EXT2_ET_OP_NOT_SUPPORTED;
+		goto out_err;
+	}
+
+	old_csum = fc_hdr->fc_csum;
+	fc_hdr->fc_csum = 0;
+	csum = jbd2_chksum(j, 0, start, end - start);
+	fc_hdr->fc_csum = old_csum;
+
+	if (csum != le32_to_cpu(fc_hdr->fc_csum)) {
+		state->fc_replay_error = -EXT2_ET_BAD_CRC;
+		goto out_err;
+	}
+	state->fc_num_blks++;
+	return 0;
+
+out_err:
+	return state->fc_replay_error;
+}
+
+/* Get length of a particular tlv */
+static int fc_tag_len(struct ext4_fc_tl *tl)
+{
+	return le16_to_cpu(tl->fc_len);
+}
+
+/* Get a pointer to "value" of a tlv */
+static __u8 *fc_tag_val(struct ext4_fc_tl *tl)
+{
+	return (__u8 *)tl + sizeof(*tl);
+}
+
+static int ext4_fc_handle_unlink(ext2_filsys fs, int parent_ino,
+				 const char *dname, int ino)
+{
+	struct ext2_inode inode;
+	int ret;
+
+	ret = ext2fs_unlink(fs, parent_ino, dname, ino, 0);
+	if (ret)
+		return ret;
+
+	ret = ext2fs_read_inode(fs, ino, &inode);
+	if (ret)
+		return ret;
+
+	if (inode.i_links_count > 1) {
+		inode.i_links_count--;
+		ret = ext2fs_write_inode(fs, ino, &inode);
+		if (ret)
+			return ret;
+	} else {
+		memset(&inode, 0, sizeof(inode));
+		ext2fs_write_inode(fs, ino, &inode);
+		ext2fs_unmark_inode_bitmap2(fs->inode_map, ino);
+		ext2fs_mark_ib_dirty(fs);
+	}
+
+	return 0;
+}
+
+static inline int get_fc_hdr_inode_len(ext2_filsys fs,
+				       struct ext4_fc_commit_hdr *fc_hdr)
+{
+	int inode_len = EXT2_GOOD_OLD_INODE_SIZE;
+
+	if (EXT2_INODE_SIZE(fs->super)
+			> EXT2_GOOD_OLD_INODE_SIZE)
+		inode_len +=
+			ext2fs_le16_to_cpu(((struct ext2_inode_large *)
+				(fc_hdr + 1))->i_extra_isize);
+	return inode_len;
+}
+
+static inline struct ext4_fc_tl *get_first_tl(ext2_filsys fs,
+					      struct ext4_fc_commit_hdr *fc_hdr)
+{
+	return (struct ext4_fc_tl *)((__u8 *)fc_hdr +
+				   sizeof(struct ext4_fc_commit_hdr) +
+				   get_fc_hdr_inode_len(fs, fc_hdr));
+}
+
+static inline struct ext4_fc_tl *get_next_tl(struct ext4_fc_tl *tl)
+{
+	return (struct ext4_fc_tl *)((__u8 *)tl +
+					le16_to_cpu(tl->fc_len) +
+					sizeof(*tl));
+}
+
+static inline int num_tls(struct ext4_fc_commit_hdr *fc_hdr)
+{
+	return le16_to_cpu(fc_hdr->fc_num_tlvs);
+}
+
+static int fc_replay_dentries(journal_t *j,
+			struct ext4_fc_commit_hdr *fc_hdr)
+{
+	int inode_len, ret, i;
+	struct ext4_fc_dentry_info *fcd;
+	ext2_filsys fs = j->j_fs_dev->k_ctx->fs;
+	struct ext2_inode *inode;
+	struct ext4_fc_tl *tl;
+	int parent_ino, ino;
+	char *dname;
+
+	inode_len = get_fc_hdr_inode_len(fs, fc_hdr);
+	tl = get_first_tl(fs, fc_hdr);
+	for (i = 0; i < le16_to_cpu(fc_hdr->fc_num_tlvs); i++) {
+		fcd = (struct ext4_fc_dentry_info *)fc_tag_val(tl);
+
+		parent_ino = le32_to_cpu(fcd->fc_parent_ino);
+		ino = le32_to_cpu(fcd->fc_ino);
+		dname = strndup(fcd->fc_dname, fc_tag_len(tl) -
+				sizeof(struct ext4_fc_dentry_info));
+		if (le16_to_cpu(tl->fc_tag) == EXT4_FC_TAG_ADD_DENTRY) {
+			ret = ext2fs_link(fs, parent_ino, dname, ino,
+					  EXT2_FT_REG_FILE);
+			ext2fs_free_mem(&dname);
+			if (ret)
+				return ret;
+			ext2fs_mark_inode_bitmap2(
+				fs->inode_map, ino);
+			ext2fs_mark_ib_dirty(fs);
+		} else if (le16_to_cpu(tl->fc_tag) == EXT4_FC_TAG_DEL_DENTRY) {
+			ret = ext4_fc_handle_unlink(fs, parent_ino, dname, ino);
+			ext2fs_free_mem(&dname);
+			if (ret)
+				return ret;
+		} else if (le16_to_cpu(tl->fc_tag) ==
+				EXT4_FC_TAG_CREAT_DENTRY) {
+			ext2fs_mark_inode_bitmap2(fs->inode_map, ino);
+			ret = ext2fs_link(fs, parent_ino, dname, ino,
+					  EXT2_FT_REG_FILE);
+			if (ret) {
+				ext2fs_free_mem(&dname);
+				return ret;
+			}
+			ext2fs_free_mem(&dname);
+
+			ret = ext2fs_get_mem(inode_len, &inode);
+			if (ret)
+				return ret;
+			ret = ext2fs_read_inode_full(fs, ino, inode, inode_len);
+			if (ret) {
+				ext2fs_free_mem(&inode);
+				return ret;
+			}
+			memcpy(inode, (struct ext2_inode *)(fc_hdr + 1),
+				inode_len);
+			ret = ext2fs_write_inode_full(fs, ino, inode,
+						      inode_len);
+			if (ret) {
+				ext2fs_free_mem(&inode);
+				return ret;
+			}
+			ext2fs_free_mem(&inode);
+			ext2fs_mark_ib_dirty(fs);
+		}
+		tl = get_next_tl(tl);
+	}
+	return 0;
+}
+
+static int ext2fs_add_extent_to_list(struct extent_list *list,
+					struct ext2fs_extent *ex)
+{
+	int ret;
+
+	if (list->count == list->size) {
+		unsigned int new_size = (list->size + NUM_EXTENTS) *
+					sizeof(struct ext2fs_extent);
+		ret = ext2fs_resize_mem(0, new_size, &list->extents);
+		if (ret)
+			return ret;
+		list->size += NUM_EXTENTS;
+	}
+
+	memcpy(list->extents + list->count, ex, sizeof(*ex));
+	list->count++;
+	return 0;
+}
+
+static int ext2fs_del_extent_from_list(struct extent_list *list,
+				       struct ext2fs_extent *del)
+{
+	struct ext2fs_extent extent;
+	int ret, i, j, del_start, del_end, iter_start, iter_end;
+
+	i = 0;
+	del_start = del->e_lblk;
+	del_end = del->e_lblk + del->e_len - 1;
+
+	while (i < list->count) {
+		iter_start = list->extents[i].e_lblk;
+		iter_end = list->extents[i].e_lblk + list->extents[i].e_len - 1;
+
+		if (del_start > iter_end || del_end < iter_start) {
+			i++;
+			continue;
+		} else if (del_start <= iter_start && del_end >= iter_end) {
+			iter_start = iter_end + 1;
+		} else if (iter_start <= del_start && del_end <= iter_end) {
+			extent.e_lblk = del_end + 1;
+			extent.e_len = iter_end - del_end;
+			extent.e_pblk = list->extents[i].e_pblk +
+					extent.e_lblk - iter_start;
+			extent.e_flags =  list->extents[i].e_flags;
+			ret = ext2fs_add_extent_to_list(list, &extent);
+			if (ret)
+				return ret;
+			iter_end = del_start - 1;
+		} else if (del_start >= iter_start && del_start <= iter_end) {
+			iter_end = del_start - 1;
+		} else if (del_end >= iter_start && del_end <= iter_end) {
+			iter_start = del_end + 1;
+		} else {
+			/* Should not come here */
+			exit(FSCK_ERROR);
+		}
+
+		if (iter_start > iter_end) {
+			/*
+			 * If this removal resulted in iter being of zero
+			 * length, remove it right away, and start the next
+			 * iteration at current index.
+			 */
+			for (j = i; j < list->count - 1; j++)
+				list->extents[j] = list->extents[j + 1];
+			list->count--;
+		} else {
+			list->extents[i].e_lblk = iter_start;
+			list->extents[i].e_len = iter_end - iter_start + 1;
+			i++;
+		}
+	}
+
+	return 0;
+}
+
+static void ext3_to_ext2fs_extent(struct ext2fs_extent *to,
+				  struct ext3_extent *from)
+{
+	to->e_pblk = ext2fs_le32_to_cpu(from->ee_start) +
+		((__u64) ext2fs_le16_to_cpu(from->ee_start_hi)
+			<< 32);
+	to->e_lblk = ext2fs_le32_to_cpu(from->ee_block);
+	to->e_len = ext2fs_le16_to_cpu(from->ee_len);
+	to->e_flags |= EXT2_EXTENT_FLAGS_LEAF;
+	if (to->e_len > EXT_INIT_MAX_LEN) {
+		to->e_len -= EXT_INIT_MAX_LEN;
+		to->e_flags |= EXT2_EXTENT_FLAGS_UNINIT;
+	}
+}
+
+static int ex_compar(const void *arg1, const void *arg2)
+{
+	struct ext2fs_extent *ex1 = (struct ext2fs_extent *)arg1;
+	struct ext2fs_extent *ex2 = (struct ext2fs_extent *)arg2;
+
+	if (ex1->e_lblk < ex2->e_lblk)
+		return -1;
+	if (ex1->e_lblk > ex2->e_lblk)
+		return 1;
+	return ex1->e_len - ex2->e_len;
+}
+
+static void sort_and_merge_extents(struct extent_list *list)
+{
+	struct ext2fs_extent *iter;
+	blk64_t ex_end;
+	int i, j;
+
+	if (list->count < 2)
+		return;
+
+	qsort(list->extents, list->count, sizeof(list->extents[0]),
+		ex_compar);
+
+	i = 0;
+	while (i < list->count - 1) {
+		if (list->extents[i].e_lblk + list->extents[i].e_len - 1 <
+			list->extents[i + 1].e_lblk) {
+			i++;
+			continue;
+		}
+		ex_end = MAX(list->extents[i].e_lblk + list->extents[i].e_len,
+			     list->extents[i + 1].e_lblk +
+			     list->extents[i + 1].e_len) - 1;
+		list->extents[i].e_len = ex_end - list->extents[i].e_lblk + 1;
+		for (j = i + 1; j < list->count - 1; j++)
+			list->extents[j] = list->extents[j + 1];
+		list->count--;
+	}
+}
+
+static void mark_blocks_used(ext2_filsys fs, blk64_t pblk, int count)
+{
+	int i = 0;
+
+	for (i = 0; i < count; i++) {
+		if (ext2fs_test_block_bitmap2(fs->block_map, pblk + i))
+			continue;
+		ext2fs_mark_block_bitmap2(fs->block_map, pblk + i);
+	}
+}
+
+static void mark_blocks_free(ext2_filsys fs, blk64_t pblk, int count)
+{
+	int i = 0;
+
+	for (i = 0; i < count; i++) {
+		if (!ext2fs_test_block_bitmap2(fs->block_map, pblk + i))
+			continue;
+		ext2fs_unmark_block_bitmap2(fs->block_map, pblk + i);
+	}
+}
+
+static int ext4_journal_fc_replay_cb(journal_t *journal, struct buffer_head *bh,
+				     enum passtype pass, int off)
+{
+	struct ext4_fc_commit_hdr *fc_hdr;
+	struct ext4_fc_tl *tl;
+	struct ext3_extent *ex;
+	ext2_extent_handle_t handle = 0;
+	int i, j, ret, ino, num_extents;
+	struct ext2_inode *inode;
+	e2fsck_t ctx = journal->j_fs_dev->k_ctx;
+	struct ext2fs_extent extent;
+	struct extent_list extent_list = {0};
+	struct ext4_fc_lrange *lrange;
+	int inode_len;
+	blk64_t pblk;
+
+	if (pass == PASS_SCAN)
+		return ext4_journal_fc_replay_scan(journal, bh, off);
+	else if (pass != PASS_REPLAY)
+		return 0;
+	ctx->fc_replay_state.fc_num_blks--;
+
+	if (ctx->fc_replay_state.fc_replay_error) {
+		jfs_debug("Scan phase detected error. Aborting replay..\n");
+		return ctx->fc_replay_state.fc_replay_error;
+	}
+
+	ret = ext2fs_read_bitmaps(ctx->fs);
+	if (ret)
+		return ret;
+
+	fc_hdr = (struct ext4_fc_commit_hdr *)
+		  ((__u8 *)bh->b_data + sizeof(journal_header_t));
+	inode_len = get_fc_hdr_inode_len(ctx->fs, fc_hdr);
+	ret = fc_replay_dentries(journal, fc_hdr);
+	if (ret)
+		return ret;
+
+	ino = le32_to_cpu(fc_hdr->fc_ino);
+	extent_list.ino = ino;
+	ret = e2fsck_read_extents(ctx, &extent_list);
+	if (ret)
+		return ret;
+
+	tl = get_first_tl(ctx->fs, fc_hdr);
+	for (i = 0; i < num_tls(fc_hdr); i++) {
+		switch (le16_to_cpu(tl->fc_tag)) {
+		case EXT4_FC_TAG_ADD_RANGE:
+			ext3_to_ext2fs_extent(&extent,
+					      (struct ext3_extent *)(tl + 1));
+			ret = ext2fs_add_extent_to_list(&extent_list, &extent);
+			if (ret)
+				goto out;
+			mark_blocks_used(ctx->fs, extent.e_pblk,  extent.e_len);
+			break;
+		case EXT4_FC_TAG_DEL_RANGE:
+			lrange = (struct ext4_fc_lrange *)(tl + 1);
+			extent.e_lblk = ext2fs_le32_to_cpu(lrange->fc_lblk);
+			extent.e_len = ext2fs_le16_to_cpu(lrange->fc_len);
+
+			pblk = 0;
+			for (j = 0; j < extent_list.count; j++) {
+				if (extent.e_lblk >=
+				    extent_list.extents[j].e_lblk &&
+				    extent.e_lblk <
+				    extent_list.extents[j].e_lblk +
+				    extent_list.extents[j].e_len) {
+					pblk = extent_list.extents[j].e_pblk +
+						extent.e_lblk -
+						extent_list.extents[j].e_lblk;
+					break;
+				}
+			}
+			ret = ext2fs_del_extent_from_list(&extent_list,
+							  &extent);
+			if (ret)
+				goto out;
+
+			if (pblk != 0)
+				mark_blocks_free(ctx->fs, pblk, extent.e_len);
+			break;
+		default:
+			break;
+		}
+		tl = get_next_tl(tl);
+	}
+	ext2fs_mark_bb_dirty(ctx->fs);
+	sort_and_merge_extents(&extent_list);
+
+	ret = e2fsck_rewrite_extent_tree(ctx, &extent_list);
+	if (ret)
+		goto out;
+
+	ret = ext2fs_get_mem(inode_len, &inode);
+	if (ret)
+		goto out;
+	ret = ext2fs_read_inode_full(ctx->fs, ino, inode, inode_len);
+	if (ret)
+		goto out;
+
+	if (inode->i_flags & EXT4_INLINE_DATA_FL) {
+		memcpy(inode, fc_hdr + 1, inode_len);
+	} else {
+		memcpy(inode, fc_hdr + 1,
+			offsetof(struct ext2_inode_large, i_block));
+		memcpy(&inode->i_generation,
+		       &((struct ext2_inode_large *)(fc_hdr + 1))->i_generation,
+		       inode_len -
+		       offsetof(struct ext2_inode_large, i_generation));
+	}
+
+	ret = ext2fs_write_inode_full(ctx->fs, ino, inode, inode_len);
+	if (ret)
+		goto out;
+
+	if (ctx->fc_replay_state.fc_num_blks == 0) {
+		ext2fs_mark_super_dirty(ctx->fs);
+		ext2fs_write_block_bitmap(ctx->fs);
+		ext2fs_write_inode_bitmap(ctx->fs);
+		ext2fs_calculate_summary_stats(ctx->fs);
+		ext2fs_set_gdt_csum(ctx->fs);
+		ext2fs_flush(ctx->fs);
+	}
+out:
+	ext2fs_free_mem(&extent_list.extents);
+	return ret;
+}
+
 static errcode_t e2fsck_get_journal(e2fsck_t ctx, journal_t **ret_journal)
 {
 	struct process_block_struct pb;
@@ -514,6 +993,10 @@  static errcode_t e2fsck_get_journal(e2fsck_t ctx, journal_t **ret_journal)
 
 	journal->j_sb_buffer = bh;
 	journal->j_superblock = (journal_superblock_t *)bh->b_data;
+	if (ext2fs_has_feature_fast_commit(ctx->fs->super))
+		journal->j_fc_replay_callback = ext4_journal_fc_replay_cb;
+	else
+		journal->j_fc_replay_callback = NULL;
 
 #ifdef USE_INODE_IO
 	if (j_inode)
@@ -688,7 +1171,13 @@  static errcode_t e2fsck_journal_load(journal_t *journal)
 	journal->j_transaction_sequence = journal->j_tail_sequence;
 	journal->j_tail = ntohl(jsb->s_start);
 	journal->j_first = ntohl(jsb->s_first);
-	journal->j_last = ntohl(jsb->s_maxlen);
+	if (jbd2_has_feature_fast_commit(journal)) {
+		journal->j_last_fc = ntohl(jsb->s_maxlen);
+		journal->j_last = journal->j_last_fc - JBD2_FAST_COMMIT_BLOCKS;
+		journal->j_first_fc = journal->j_last + 1;
+	} else {
+		journal->j_last = ntohl(jsb->s_maxlen);
+	}
 
 	return 0;
 }
diff --git a/lib/ext2fs/ext2_fs.h b/lib/ext2fs/ext2_fs.h
index 6c20ea77..410db16a 100644
--- a/lib/ext2fs/ext2_fs.h
+++ b/lib/ext2fs/ext2_fs.h
@@ -490,6 +490,52 @@  struct ext2_inode_large {
 /*9c*/	__u32   i_projid;       /* Project ID */
 };
 
+/* Fast commit stuff */
+/* Ext4 fast commit related info */
+
+/* Magic of fast commit header */
+#define EXT4_FC_MAGIC			0xE2540090
+
+struct ext4_fc_commit_hdr {
+	/* Fast commit magic, should be EXT4_FC_MAGIC */
+	__u32 fc_magic;
+	/* Features used by this fast commit block */
+	__u8 fc_features;
+	/* Number of TLVs in this fast commmit block */
+	__u16 fc_num_tlvs;
+	/* Inode number */
+	__u32 fc_ino;
+	/* Csum(hdr+contents) */
+	__u32 fc_csum;
+};
+
+struct ext4_fc_lrange {
+	__le32 fc_lblk;
+	__le32 fc_len;
+};
+
+#define EXT4_FC_TAG_ADD_RANGE		0x1
+#define EXT4_FC_TAG_DEL_RANGE		0x2
+#define EXT4_FC_TAG_CREAT_DENTRY	0x3
+#define EXT4_FC_TAG_ADD_DENTRY		0x4
+#define EXT4_FC_TAG_DEL_DENTRY		0x5
+
+struct ext4_fc_tl {
+	__le16 fc_tag;
+	__le16 fc_len;
+};
+
+/* On disk fast commit tlv value structure for dirent tags:
+ *  - EXT4_FC_TAG_CREATE_DENTRY
+ *  - EXT4_FC_TAG_ADD_DENTRY
+ *  - EXT4_FC_TAG_DEL_DENTRY
+ */
+struct ext4_fc_dentry_info {
+	__le32 fc_parent_ino;
+	__le32 fc_ino;
+	__u8 fc_dname[0];
+};
+
 #define EXT4_INODE_CSUM_HI_EXTRA_END	\
 	(offsetof(struct ext2_inode_large, i_checksum_hi) + sizeof(__u16) - \
 	 EXT2_GOOD_OLD_INODE_SIZE)