[RFC,1/5] ext4: fix reserved cluster accounting at delayed write time

Message ID 20180513175624.12887-2-enwlinux@gmail.com
State New
Headers show
Series
  • ext4: rework delayed allocated cluster accounting
Related show

Commit Message

Eric Whitney May 13, 2018, 5:56 p.m.
The code in ext4_da_map_blocks sometimes reserves space for more
delayed allocated clusters than it should, resulting in premature
ENOSPC, exceeded quota, and inaccurate free space reporting.

It fails to check the extents status tree for written and unwritten
clusters which have already been allocated and which share a cluster
with the block being delayed allocated.  In addition, it fails to
continue on and search the extent tree when no delayed allocated or
allocated clusters have been found in the extents status tree.  Written
extents may be purged from the extents status tree under memory pressure.
Only delayed and unwritten delayed extents are guaranteed to be retained.

Signed-off-by: Eric Whitney <enwlinux@gmail.com>
---
 fs/ext4/ext4.h           |   4 ++
 fs/ext4/extents.c        | 114 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/extents_status.c |  64 ++++++++++++++++++++++++++
 fs/ext4/extents_status.h |  11 +++++
 fs/ext4/inode.c          |  30 ++++++++++---
 5 files changed, 218 insertions(+), 5 deletions(-)

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a42e71203e53..6ee2fded64bf 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3122,6 +3122,9 @@  extern int ext4_find_delalloc_range(struct inode *inode,
 				    ext4_lblk_t lblk_start,
 				    ext4_lblk_t lblk_end);
 extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
+extern int ext4_find_cluster(struct inode *inode,
+			     int (*match_fn)(struct extent_status *es),
+			     ext4_lblk_t lblk);
 extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
 extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			__u64 start, __u64 len);
@@ -3132,6 +3135,7 @@  extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
 				struct inode *inode2, ext4_lblk_t lblk1,
 			     ext4_lblk_t lblk2,  ext4_lblk_t count,
 			     int mark_unwritten,int *err);
+extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu);
 
 /* move_extent.c */
 extern void ext4_double_down_write_data_sem(struct inode *first,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c969275ce3ee..872782ba8bc3 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3846,6 +3846,47 @@  int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk)
 	return ext4_find_delalloc_range(inode, lblk_start, lblk_end);
 }
 
+/*
+ * If the block range specified by @start and @end contains an es extent
+ * matched by the matching function specified by @match_fn, return 1.  Else,
+ * return 0.
+ */
+int ext4_find_range(struct inode *inode,
+		    int (*match_fn)(struct extent_status *es),
+		    ext4_lblk_t start,
+		    ext4_lblk_t end)
+{
+	struct extent_status es;
+
+	ext4_es_find_extent_range(inode, match_fn, start, end, &es);
+	if (es.es_len == 0)
+		return 0; /* there is no matching extent in this tree */
+	else if (es.es_lblk <= start &&
+		 start < es.es_lblk + es.es_len)
+		return 1;
+	else if (start <= es.es_lblk && es.es_lblk <= end)
+		return 1;
+	else
+		return 0;
+}
+
+/*
+ * If the cluster containing @lblk is contained in an es extent matched by
+ * the matching function specified by @match_fn, return 1.  Else, return 0.
+ */
+int ext4_find_cluster(struct inode *inode,
+		      int (*match_fn)(struct extent_status *es),
+		      ext4_lblk_t lblk)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	ext4_lblk_t lblk_start, lblk_end;
+
+	lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
+	lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
+
+	return ext4_find_range(inode, match_fn, lblk_start, lblk_end);
+}
+
 /**
  * Determines how many complete clusters (out of those specified by the 'map')
  * are under delalloc and were reserved quota for.
@@ -5935,3 +5976,76 @@  ext4_swap_extents(handle_t *handle, struct inode *inode1,
 	}
 	return replaced_count;
 }
+
+/*
+ *  Returns true if any block in the cluster containing lblk is mapped,
+ *  else returns false.  Can also return errors.
+ *
+ *  Derived from ext4_ext_map_blocks().
+ */
+int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct ext4_ext_path *path;
+	int depth, mapped = 0, err = 0;
+	struct ext4_extent *extent;
+	ext4_lblk_t first_lblk, first_lclu, last_lclu;
+
+	/* search for the extent closest to the first block in the cluster */
+	path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0);
+	if (IS_ERR(path)) {
+		err = PTR_ERR(path);
+		path = NULL;
+		goto out;
+	}
+
+	depth = ext_depth(inode);
+
+	/*
+	 * A consistent leaf must not be empty.  This situation is possible,
+	 * though, _during_ tree modification, and it's why an assert can't
+	 * be put in ext4_find_extent().
+	 */
+	if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
+		EXT4_ERROR_INODE(inode, "bad extent address "
+				 "lblock: %lu, depth: %d, pblock %lld",
+				 (unsigned long) EXT4_C2B(sbi, lclu),
+				 depth, path[depth].p_block);
+		err = -EFSCORRUPTED;
+		goto out;
+	}
+
+	extent = path[depth].p_ext;
+
+	/* can't be mapped if the extent tree is empty */
+	if (extent == NULL)
+		goto out;
+
+	first_lblk = le32_to_cpu(extent->ee_block);
+	first_lclu = EXT4_B2C(sbi, first_lblk);
+
+	/*
+	 * Three possible outcomes at this point - found extent spanning
+	 * the target cluster, to the left of the target cluster, or to the
+	 * right of the target cluster.  The first two cases are handled here.
+	 * The last case indicates the target cluster is not mapped.
+	 */
+	if (lclu >= first_lclu) {
+		last_lclu = EXT4_B2C(sbi, first_lblk +
+				     ext4_ext_get_actual_len(extent) - 1);
+		if (lclu <= last_lclu) {
+			mapped = 1;
+		} else {
+			first_lblk = ext4_ext_next_allocated_block(path);
+			first_lclu = EXT4_B2C(sbi, first_lblk);
+			if (lclu == first_lclu)
+				mapped = 1;
+		}
+	}
+
+out:
+	ext4_ext_drop_refs(path);
+	kfree(path);
+
+	return err ? err : mapped;
+}
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 763ef185dd17..0c395b5a57a2 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -296,6 +296,70 @@  void ext4_es_find_delayed_extent_range(struct inode *inode,
 	trace_ext4_es_find_delayed_extent_range_exit(inode, es);
 }
 
+/*
+ * Find the first es extent in the block range specified by @lblk and @end
+ * that satisfies the matching function specified by @match_fn.  If a
+ * match is found, it's returned in @es.  If not, a struct extents_status
+ * is returned in @es whose es_lblk, es_len, and es_pblk components are 0.
+ *
+ * This function is derived from ext4_es_find_delayed_extent_range().
+ * In the future, it could be used to replace it with the use of a suitable
+ * matching function to avoid redundant code.
+ */
+void ext4_es_find_extent_range(struct inode *inode,
+			       int (*match_fn)(struct extent_status *es),
+			       ext4_lblk_t lblk, ext4_lblk_t end,
+			       struct extent_status *es)
+{
+	struct ext4_es_tree *tree = NULL;
+	struct extent_status *es1 = NULL;
+	struct rb_node *node;
+
+	BUG_ON(es == NULL);
+	BUG_ON(end < lblk);
+
+	read_lock(&EXT4_I(inode)->i_es_lock);
+	tree = &EXT4_I(inode)->i_es_tree;
+
+	/* see if the extent has been cached */
+	es->es_lblk = es->es_len = es->es_pblk = 0;
+	if (tree->cache_es) {
+		es1 = tree->cache_es;
+		if (in_range(lblk, es1->es_lblk, es1->es_len)) {
+			es_debug("%u cached by [%u/%u) %llu %x\n",
+				 lblk, es1->es_lblk, es1->es_len,
+				 ext4_es_pblock(es1), ext4_es_status(es1));
+			goto out;
+		}
+	}
+
+	es1 = __es_tree_search(&tree->root, lblk);
+
+out:
+	if (es1 && !match_fn(es1)) {
+		while ((node = rb_next(&es1->rb_node)) != NULL) {
+			es1 = rb_entry(node, struct extent_status, rb_node);
+			if (es1->es_lblk > end) {
+				es1 = NULL;
+				break;
+			}
+			if (match_fn(es1))
+				break;
+		}
+	}
+
+	if (es1 && match_fn(es1)) {
+		tree->cache_es = es1;
+		es->es_lblk = es1->es_lblk;
+		es->es_len = es1->es_len;
+		es->es_pblk = es1->es_pblk;
+	}
+
+	read_unlock(&EXT4_I(inode)->i_es_lock);
+
+}
+
+
 static void ext4_es_list_add(struct inode *inode)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 8efdeb903d6b..439143f7504f 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -93,6 +93,10 @@  extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 extern void ext4_es_find_delayed_extent_range(struct inode *inode,
 					ext4_lblk_t lblk, ext4_lblk_t end,
 					struct extent_status *es);
+extern void ext4_es_find_extent_range(struct inode *inode,
+				      int (*match_fn)(struct extent_status *es),
+				      ext4_lblk_t lblk, ext4_lblk_t end,
+				      struct extent_status *es);
 extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
 				 struct extent_status *es);
 
@@ -126,6 +130,13 @@  static inline int ext4_es_is_hole(struct extent_status *es)
 	return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0;
 }
 
+static inline int ext4_es_is_claimed(struct extent_status *es)
+{
+	return (ext4_es_type(es) &
+		(EXTENT_STATUS_WRITTEN | EXTENT_STATUS_UNWRITTEN |
+		 EXTENT_STATUS_DELAYED)) != 0;
+}
+
 static inline void ext4_es_set_referenced(struct extent_status *es)
 {
 	es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1e50c5efae67..8f5235b2c094 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1788,6 +1788,7 @@  static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 			      struct ext4_map_blocks *map,
 			      struct buffer_head *bh)
 {
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	struct extent_status es;
 	int retval;
 	sector_t invalid_block = ~((sector_t) 0xffff);
@@ -1857,17 +1858,36 @@  static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 add_delayed:
 	if (retval == 0) {
 		int ret;
+		bool reserve = true;   /* reserve one cluster */
 		/*
 		 * XXX: __block_prepare_write() unmaps passed block,
 		 * is it OK?
 		 */
 		/*
-		 * If the block was allocated from previously allocated cluster,
-		 * then we don't need to reserve it again. However we still need
-		 * to reserve metadata for every block we're going to write.
+		 * If the cluster containing m_lblk is shared with a delayed,
+		 * written, or unwritten extent in a bigalloc file system,
+		 * it's already been claimed and does not need to be reserved.
+		 * Written extents may have been purged from the extents status
+		 * tree if the system has been under memory pressure, so it's
+		 * necessary to examine the extent tree to confirm the
+		 * reservation.
 		 */
-		if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 ||
-		    !ext4_find_delalloc_cluster(inode, map->m_lblk)) {
+		if (sbi->s_cluster_ratio > 1) {
+			reserve = !ext4_find_cluster(inode,
+						     &ext4_es_is_claimed,
+						     map->m_lblk);
+			if (reserve) {
+				ret = ext4_clu_mapped(inode,
+						EXT4_B2C(sbi, map->m_lblk));
+				if (ret < 0) {
+					retval = ret;
+					goto out_unlock;
+				}
+				reserve = !ret;
+			}
+		}
+
+		if (reserve) {
 			ret = ext4_da_reserve_space(inode);
 			if (ret) {
 				/* not enough space to reserve */