[RFC,3/5] ext4: adjust reserved cluster count when removing extents

Message ID 20180513175624.12887-4-enwlinux@gmail.com
State New
Headers show
Series
  • ext4: rework delayed allocated cluster accounting
Related show

Commit Message

Eric Whitney May 13, 2018, 5:56 p.m.
After applying the previous two patches in this series, it's not
uncommon to see kernel error messages reporting a shortage of cluster
reservations when extents are mapped after files have been truncated
or punched.  This can lead to data loss because a cluster may not
be available for allocation at the time a delayed extent must be
mapped.  The overcounting addressed by the previous two patches
appears to have masked problems with the code in
ext4_da_page_release_reservation() that reduces the reserved cluster
count on truncation, hole punching, etc.

For example, the reserved cluster count may actually need to be
incremented when a page is invalidated if that page was the last
page in an allocated cluster shared with a delayed extent.  This
situation could arise if a write to a portion of a cluster was
mapped prior to a delayed write to another portion of the same
cluster.  No reservation would be made on the second write, since the
cluster was allocated, but liability for that write would remain if
the allocated cluster was freed before the second write was mapped.

It's also not clear that it's checking the proper blocks for delayed
allocation status when reducing the reserved cluster count.  This
should only be done for those pages whose buffer delay bit has been
set.

This patch and the following two patches address these problems.

Modify ext4_ext_remove_space() and the code it calls to correct the
reserved cluster count for delayed allocated clusters shared with
allocated blocks when a block range is removed from the extent tree.
A shared cluster (referred to as a partial cluster in the code) can
occur at the ends of a written or unwritten extent when the starting
or ending cluster is not aligned on a starting or ending cluster
boundary, respectively.  The reserved cluster count is incremented if
the portion of a partial cluster not shared by the block range to be
removed is shared with at least one delayed extent but not shared with
any other written or unwritten extent.  This reflects the fact that
the partial delayed cluster requires a new reservation to reserve
space now that the allocated partial cluster has been freed.

Add a new function, ext4_rereserve_cluster(), to reapply a reservation
on a delayed allocated cluster sharing blocks with a freed allocated
cluster.  To avoid ENOSPC on reservation, a flag is applied to
ext4_free_blocks() to briefly defer updating the freeclusters counter
when an allocated cluster is freed.  This prevents another thread
from allocating the freed block before a check can be made to
determine whether a reservation should be reapplied;
ext4_rereserve_cluster() performs the check and the deferred
freecluster counter update.

Signed-off-by: Eric Whitney <enwlinux@gmail.com>
---
 fs/ext4/ext4.h    |   1 +
 fs/ext4/extents.c | 309 +++++++++++++++++++++++++++++++++++-------------------
 fs/ext4/mballoc.c |  11 +-
 3 files changed, 209 insertions(+), 112 deletions(-)

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6ee2fded64bf..d16064104cd2 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -617,6 +617,7 @@  enum {
 #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE		0x0008
 #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER	0x0010
 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER	0x0020
+#define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER      0x0040
 
 /*
  * ioctl commands
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index d3e4e482a475..66e0df0860b6 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -45,6 +45,13 @@ 
 #define EXT4_EXT_DATA_VALID1	0x8  /* first half contains valid data */
 #define EXT4_EXT_DATA_VALID2	0x10 /* second half contains valid data */
 
+struct partial_cluster {
+	ext4_fsblk_t pclu;
+	ext4_lblk_t lblk;
+	enum {initial, tofree, nofree} state;
+	bool fragment;
+};
+
 static __le32 ext4_extent_block_csum(struct inode *inode,
 				     struct ext4_extent_header *eh)
 {
@@ -2484,102 +2491,172 @@  static inline int get_default_free_blocks_flags(struct inode *inode)
 	return 0;
 }
 
+/*
+ * Used when freeing a delayed partial cluster containing @lblk when
+ * the RERESERVE_CLUSTER flag has been applied to ext4_free_blocks().
+ */
+static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct ext4_inode_info *ei = EXT4_I(inode);
+
+	/*
+	 * rereserve if cluster is being freed and if any block in the
+	 * cluster is delayed - if so, the reserved count was decremented
+	 * for the allocated cluster we're freeing
+	 */
+	if (ext4_find_delalloc_cluster(inode, lblk)) {
+		spin_lock(&ei->i_block_reservation_lock);
+		ei->i_reserved_data_blocks++;
+		percpu_counter_add(&sbi->s_dirtyclusters_counter, 1);
+		spin_unlock(&ei->i_block_reservation_lock);
+	}
+
+	/*
+	 * the free_clusters counter is always corrected to reflect
+	 * ext4_free_blocks() freeing the cluster - it's just being
+	 * deferred until we can determine whether the dirtyclusters_counter
+	 * should be incremented.
+	 */
+	percpu_counter_add(&sbi->s_freeclusters_counter, 1);
+}
+
 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 			      struct ext4_extent *ex,
-			      long long *partial_cluster,
+			      struct partial_cluster *partial,
 			      ext4_lblk_t from, ext4_lblk_t to)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	unsigned short ee_len = ext4_ext_get_actual_len(ex);
-	ext4_fsblk_t pblk;
-	int flags = get_default_free_blocks_flags(inode);
+	ext4_fsblk_t last_pblk, pblk;
+	ext4_lblk_t num;
+	int flags;
+
+	/* only extent tail removal is allowed */
+	if (from < le32_to_cpu(ex->ee_block) ||
+	    to != le32_to_cpu(ex->ee_block) + ee_len - 1) {
+		ext4_error(sbi->s_sb, "strange request: removal(2) "
+			   "%u-%u from %u:%u",
+			   from, to, le32_to_cpu(ex->ee_block), ee_len);
+		return 0;
+	}
+
+#ifdef EXTENTS_STATS
+	spin_lock(&sbi->s_ext_stats_lock);
+	sbi->s_ext_blocks += ee_len;
+	sbi->s_ext_extents++;
+	if (ee_len < sbi->s_ext_min)
+		sbi->s_ext_min = ee_len;
+	if (ee_len > sbi->s_ext_max)
+		sbi->s_ext_max = ee_len;
+	if (ext_depth(inode) > sbi->s_depth_max)
+		sbi->s_depth_max = ext_depth(inode);
+	spin_unlock(&sbi->s_ext_stats_lock);
+#endif
+
+	/* temporary - fix partial reporting */
+	trace_ext4_remove_blocks(inode, ex, from, to,
+				 (long long) partial->pclu);
+
+	/*
+	 * if we have a partial cluster, and it's different from the
+	 * cluster of the last block in the extent, we free it
+	 */
+	last_pblk = ext4_ext_pblock(ex) + ee_len - 1;
+
+	if (partial->state != initial &&
+	    partial->pclu != EXT4_B2C(sbi, last_pblk)) {
+		if (partial->state == tofree) {
+			flags = get_default_free_blocks_flags(inode);
+			if (test_opt(inode->i_sb, DELALLOC))
+				flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
+			ext4_free_blocks(handle, inode, NULL,
+					EXT4_C2B(sbi, partial->pclu),
+					sbi->s_cluster_ratio, flags);
+			if (test_opt(inode->i_sb, DELALLOC))
+				ext4_rereserve_cluster(inode, partial->lblk);
+		}
+		partial->state = initial;
+	}
+
+	num = le32_to_cpu(ex->ee_block) + ee_len - from;
+	pblk = ext4_ext_pblock(ex) + ee_len - num;
+
+	/*
+	 * We free the partial cluster at the end of the extent (if any),
+	 * unless the cluster is used by another extent (partial_cluster
+	 * state is nofree).  If a partial cluster exists here, it must be
+	 * shared with the last block in the extent.
+	 */
+	flags = get_default_free_blocks_flags(inode);
+
+	if (partial->state == nofree) {
+		flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
+	} else {
+		/* partial, left end cluster aligned, right end unaligned */
+		if (test_opt(inode->i_sb, DELALLOC) &&
+		    EXT4_LBLK_CMASK(sbi, to) >= from &&
+		    EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) {
+			if (partial->state == initial ||
+			    (partial->state == tofree && partial->fragment) ||
+			    (partial->state == tofree && to + 1 !=
+				    partial->lblk)) {
+				flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
+				ext4_free_blocks(handle, inode, NULL,
+						EXT4_PBLK_CMASK(sbi, last_pblk),
+						sbi->s_cluster_ratio, flags);
+				ext4_rereserve_cluster(inode, to);
+				partial->state = initial;
+				flags = get_default_free_blocks_flags(inode);
+				flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
+			}
+		}
+	}
 
 	/*
 	 * For bigalloc file systems, we never free a partial cluster
-	 * at the beginning of the extent.  Instead, we make a note
-	 * that we tried freeing the cluster, and check to see if we
+	 * at the beginning of the extent.  Instead, we check to see if we
 	 * need to free it on a subsequent call to ext4_remove_blocks,
 	 * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space.
 	 */
+
 	flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
+	ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
+
+	/* reset the partial cluster if we've freed past it */
+	if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk))
+		partial->state = initial;
 
-	trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster);
 	/*
-	 * If we have a partial cluster, and it's different from the
-	 * cluster of the last block, we need to explicitly free the
-	 * partial cluster here.
+	 * If we've freed the entire extent but the beginning is not left
+	 * cluster aligned and is not marked as ineligible for freeing record
+	 * the partial cluster at the beginning of the extent.  It wasn't freed
+	 * by the preceding ext4_free_blocks() call, and we need to look
+	 * farther to the left to determine if it's to be freed (not shared
+	 * with another extent). Else, reset the partial cluster - we're either
+	 * done freeing or the beginning of the extent is left cluster aligned.
 	 */
-	pblk = ext4_ext_pblock(ex) + ee_len - 1;
-	if (*partial_cluster > 0 &&
-	    *partial_cluster != (long long) EXT4_B2C(sbi, pblk)) {
-		ext4_free_blocks(handle, inode, NULL,
-				 EXT4_C2B(sbi, *partial_cluster),
-				 sbi->s_cluster_ratio, flags);
-		*partial_cluster = 0;
-	}
-
-#ifdef EXTENTS_STATS
-	{
-		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-		spin_lock(&sbi->s_ext_stats_lock);
-		sbi->s_ext_blocks += ee_len;
-		sbi->s_ext_extents++;
-		if (ee_len < sbi->s_ext_min)
-			sbi->s_ext_min = ee_len;
-		if (ee_len > sbi->s_ext_max)
-			sbi->s_ext_max = ee_len;
-		if (ext_depth(inode) > sbi->s_depth_max)
-			sbi->s_depth_max = ext_depth(inode);
-		spin_unlock(&sbi->s_ext_stats_lock);
+	if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) {
+		if (partial->state == initial) {
+			partial->pclu = EXT4_B2C(sbi, pblk);
+			partial->lblk = from;
+			partial->fragment =
+				(bool) (EXT4_LBLK_CMASK(sbi, to) < from);
+			partial->state = tofree;
+		}
+		if (partial->state == tofree) {
+			/*
+			 * Look for gap between right edge and outstanding
+			 * partial cluster.  Left edge handled in case above.
+			 */
+			if (!partial->fragment && to + 1 != partial->lblk)
+				partial->fragment = true;
+			partial->lblk = from;
+		}
+	} else {
+		partial->state = initial;
 	}
-#endif
-	if (from >= le32_to_cpu(ex->ee_block)
-	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
-		/* tail removal */
-		ext4_lblk_t num;
-		long long first_cluster;
-
-		num = le32_to_cpu(ex->ee_block) + ee_len - from;
-		pblk = ext4_ext_pblock(ex) + ee_len - num;
-		/*
-		 * Usually we want to free partial cluster at the end of the
-		 * extent, except for the situation when the cluster is still
-		 * used by any other extent (partial_cluster is negative).
-		 */
-		if (*partial_cluster < 0 &&
-		    *partial_cluster == -(long long) EXT4_B2C(sbi, pblk+num-1))
-			flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
 
-		ext_debug("free last %u blocks starting %llu partial %lld\n",
-			  num, pblk, *partial_cluster);
-		ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
-		/*
-		 * If the block range to be freed didn't start at the
-		 * beginning of a cluster, and we removed the entire
-		 * extent and the cluster is not used by any other extent,
-		 * save the partial cluster here, since we might need to
-		 * delete if we determine that the truncate or punch hole
-		 * operation has removed all of the blocks in the cluster.
-		 * If that cluster is used by another extent, preserve its
-		 * negative value so it isn't freed later on.
-		 *
-		 * If the whole extent wasn't freed, we've reached the
-		 * start of the truncated/punched region and have finished
-		 * removing blocks.  If there's a partial cluster here it's
-		 * shared with the remainder of the extent and is no longer
-		 * a candidate for removal.
-		 */
-		if (EXT4_PBLK_COFF(sbi, pblk) && ee_len == num) {
-			first_cluster = (long long) EXT4_B2C(sbi, pblk);
-			if (first_cluster != -*partial_cluster)
-				*partial_cluster = first_cluster;
-		} else {
-			*partial_cluster = 0;
-		}
-	} else
-		ext4_error(sbi->s_sb, "strange request: removal(2) "
-			   "%u-%u from %u:%u",
-			   from, to, le32_to_cpu(ex->ee_block), ee_len);
 	return 0;
 }
 
@@ -2602,7 +2679,7 @@  static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 static int
 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		 struct ext4_ext_path *path,
-		 long long *partial_cluster,
+		 struct partial_cluster *partial,
 		 ext4_lblk_t start, ext4_lblk_t end)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2634,7 +2711,8 @@  ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 	ex_ee_block = le32_to_cpu(ex->ee_block);
 	ex_ee_len = ext4_ext_get_actual_len(ex);
 
-	trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
+	/* temporary - fix partial reporting */
+	trace_ext4_ext_rm_leaf(inode, start, ex, (long long) partial->pclu);
 
 	while (ex >= EXT_FIRST_EXTENT(eh) &&
 			ex_ee_block + ex_ee_len > start) {
@@ -2665,8 +2743,8 @@  ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 			 */
 			if (sbi->s_cluster_ratio > 1) {
 				pblk = ext4_ext_pblock(ex);
-				*partial_cluster =
-					-(long long) EXT4_B2C(sbi, pblk);
+				partial->pclu = EXT4_B2C(sbi, pblk);
+				partial->state = nofree;
 			}
 			ex--;
 			ex_ee_block = le32_to_cpu(ex->ee_block);
@@ -2708,8 +2786,7 @@  ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		if (err)
 			goto out;
 
-		err = ext4_remove_blocks(handle, inode, ex, partial_cluster,
-					 a, b);
+		err = ext4_remove_blocks(handle, inode, ex, partial, a, b);
 		if (err)
 			goto out;
 
@@ -2763,18 +2840,23 @@  ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 	 * If there's a partial cluster and at least one extent remains in
 	 * the leaf, free the partial cluster if it isn't shared with the
 	 * current extent.  If it is shared with the current extent
-	 * we zero partial_cluster because we've reached the start of the
+	 * we reset the partial cluster because we've reached the start of the
 	 * truncated/punched region and we're done removing blocks.
 	 */
-	if (*partial_cluster > 0 && ex >= EXT_FIRST_EXTENT(eh)) {
+	if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) {
 		pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
-		if (*partial_cluster != (long long) EXT4_B2C(sbi, pblk)) {
+		if (partial->pclu != EXT4_B2C(sbi, pblk)) {
+			int flags = get_default_free_blocks_flags(inode);
+
+			if (test_opt(inode->i_sb, DELALLOC))
+				flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
 			ext4_free_blocks(handle, inode, NULL,
-					 EXT4_C2B(sbi, *partial_cluster),
-					 sbi->s_cluster_ratio,
-					 get_default_free_blocks_flags(inode));
+					 EXT4_C2B(sbi, partial->pclu),
+					 sbi->s_cluster_ratio, flags);
+			if (test_opt(inode->i_sb, DELALLOC))
+				ext4_rereserve_cluster(inode, partial->lblk);
 		}
-		*partial_cluster = 0;
+		partial->state = initial;
 	}
 
 	/* if this leaf is free, then we should
@@ -2813,10 +2895,15 @@  int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	int depth = ext_depth(inode);
 	struct ext4_ext_path *path = NULL;
-	long long partial_cluster = 0;
+	struct partial_cluster partial;
 	handle_t *handle;
 	int i = 0, err = 0;
 
+	partial.pclu = 0;
+	partial.lblk = 0;
+	partial.state = initial;
+	partial.fragment = false;
+
 	ext_debug("truncate since %u to %u\n", start, end);
 
 	/* probably first extent we're gonna free will be last in block */
@@ -2876,8 +2963,8 @@  int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
 			 */
 			if (sbi->s_cluster_ratio > 1) {
 				pblk = ext4_ext_pblock(ex) + end - ee_block + 2;
-				partial_cluster =
-					-(long long) EXT4_B2C(sbi, pblk);
+				partial.pclu = EXT4_B2C(sbi, pblk);
+				partial.state = nofree;
 			}
 
 			/*
@@ -2905,9 +2992,10 @@  int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
 						    &ex);
 			if (err)
 				goto out;
-			if (pblk)
-				partial_cluster =
-					-(long long) EXT4_B2C(sbi, pblk);
+			if (pblk) {
+				partial.pclu = EXT4_B2C(sbi, pblk);
+				partial.state = nofree;
+			}
 		}
 	}
 	/*
@@ -2942,8 +3030,7 @@  int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
 		if (i == depth) {
 			/* this is leaf block */
 			err = ext4_ext_rm_leaf(handle, inode, path,
-					       &partial_cluster, start,
-					       end);
+					       &partial, start, end);
 			/* root level has p_bh == NULL, brelse() eats this */
 			brelse(path[i].p_bh);
 			path[i].p_bh = NULL;
@@ -3015,21 +3102,25 @@  int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
 		}
 	}
 
+	/* temporary - fix partial reporting */
 	trace_ext4_ext_remove_space_done(inode, start, end, depth,
-			partial_cluster, path->p_hdr->eh_entries);
+			   (long long) partial.pclu, path->p_hdr->eh_entries);
 
 	/*
-	 * If we still have something in the partial cluster and we have removed
-	 * even the first extent, then we should free the blocks in the partial
-	 * cluster as well.  (This code will only run when there are no leaves
-	 * to the immediate left of the truncated/punched region.)
+	 * If there's a partial cluster and we have removed the first extent
+	 * in the file, then we should free the partial cluster as well.
 	 */
-	if (partial_cluster > 0 && err == 0) {
-		/* don't zero partial_cluster since it's not used afterwards */
+	if (partial.state == tofree && err == 0) {
+		int flags = get_default_free_blocks_flags(inode);
+
+		if (test_opt(inode->i_sb, DELALLOC))
+			flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
 		ext4_free_blocks(handle, inode, NULL,
-				 EXT4_C2B(sbi, partial_cluster),
-				 sbi->s_cluster_ratio,
-				 get_default_free_blocks_flags(inode));
+				 EXT4_C2B(sbi, partial.pclu),
+				 sbi->s_cluster_ratio, flags);
+		if (test_opt(inode->i_sb, DELALLOC))
+			ext4_rereserve_cluster(inode, partial.lblk);
+		partial.state = initial;
 	}
 
 	/* TODO: flexible tree reduction should be here */
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 769a62708b1c..0f0b75a2ce60 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4934,9 +4934,14 @@  void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			     &sbi->s_flex_groups[flex_group].free_clusters);
 	}
 
-	if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
-		dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
-	percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
+	if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) {
+		dquot_reclaim_block(inode, EXT4_C2B(sbi, count_clusters));
+	} else {
+		if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
+			dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
+		percpu_counter_add(&sbi->s_freeclusters_counter,
+				   count_clusters);
+	}
 
 	ext4_mb_unload_buddy(&e4b);