[2/2] Ext4: bigalloc: do not reserve space for delalloc extents if there is an available cluster

Message ID 1525702692-65658-3-git-send-email-bo.liu@linux.alibaba.com
State New
Headers show
Series
  • bigalloc + delalloc reservation leak fixes
Related show

Commit Message

Liu Bo May 7, 2018, 2:18 p.m.
Here is another overreservation which results in premature ENOSPC when
using bigalloc and delalloc,

case 1:
$ xfs_io -f -c "pwrite 0 2" -c "fsync" -c "pwrite 16K 2" -c "fsync" /mnt/dir/foo
or
case 2:
$ xfs_io -f -c "falloc 0 2" -c "pwrite 16K 2" -c "fsync" /mnt/dir/foo

Currently 'du' reports 128K instead of 64K because of overreservation.

What was happending is, take case 1 as an example,

a) in ext4_da_write_begin(), extent [0,2) will reserve one cluster for
itself,

b) forced by fsync, extent [0,2) allocates a 64K cluster and uses the
reserved cluster,

c) in ext4_da_write_begin(), extent [16k,16k+2) will reserve one cluster
for itself,

d) forced by fsync, extent [16k, 16k+2) finds out that it can use the new
'allocated' block, i.e. @map_from_cluster is true, such that it doesn't
update the reservation accounting, and no one will.

Currently ext4_da_write_begin() only checks if there is a delalloc extent
which has reserved a cluster, but a written extent also implies an
available cluster.

With this, ext4_da_write_begin() now checks delalloc, written and
unwritten extents.

Signed-off-by: Liu Bo <bo.liu@linux.alibaba.com>
---
 fs/ext4/ext4.h           |  3 ++-
 fs/ext4/extents.c        | 24 ++++++++++++++++++++++--
 fs/ext4/extents_status.c | 49 ++++++++++++++++++++++++++++++++----------------
 fs/ext4/extents_status.h | 12 ++++++++++++
 fs/ext4/inode.c          |  5 +++--
 5 files changed, 72 insertions(+), 21 deletions(-)

Comments

kbuild test robot May 7, 2018, 8:47 p.m. | #1
Hi Liu,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on ext4/dev]
[also build test WARNING on v4.17-rc4 next-20180507]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Liu-Bo/Ext4-bigalloc-fix-overreservation-on-quota-accounting/20180508-014241
base:   https://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git dev
reproduce:
        # apt-get install sparse
        make ARCH=x86_64 allmodconfig
        make C=1 CF=-D__CHECK_ENDIAN__


sparse warnings: (new ones prefixed by >>)

>> fs/ext4/extents_status.c:236:6: sparse: symbol '__ext4_es_find_range_type' was not declared. Should it be static?

Please review and possibly fold the followup patch.

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a42e71203e53..b07d30b1c635 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3121,7 +3121,8 @@  extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t,
 extern int ext4_find_delalloc_range(struct inode *inode,
 				    ext4_lblk_t lblk_start,
 				    ext4_lblk_t lblk_end);
-extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
+extern int ext4_find_delalloc_or_allocated_cluster(struct inode *inode,
+						   ext4_lblk_t lblk);
 extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
 extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			__u64 start, __u64 len);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 1b3b24a53e1f..acc8b5c213b1 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3836,14 +3836,34 @@  int ext4_find_delalloc_range(struct inode *inode,
 		return 0;
 }
 
-int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk)
+static int ext4_find_own_rsv_range(struct inode *inode,
+					       ext4_lblk_t lblk_start,
+					       ext4_lblk_t lblk_end)
+{
+	struct extent_status es;
+
+	ext4_es_find_own_rsv_extent_range(inode, lblk_start,
+						     lblk_end, &es);
+	if (es.es_len == 0)
+		return 0; /* there is no delay extent in this tree */
+	else if (es.es_lblk <= lblk_start &&
+		 lblk_start < es.es_lblk + es.es_len)
+		return 1;
+	else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end)
+		return 1;
+	else
+		return 0;
+}
+
+int ext4_find_delalloc_or_allocated_cluster(struct inode *inode,
+					    ext4_lblk_t lblk)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	ext4_lblk_t lblk_start, lblk_end;
 	lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
 	lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
 
-	return ext4_find_delalloc_range(inode, lblk_start, lblk_end);
+	return ext4_find_own_rsv_range(inode, lblk_start, lblk_end);
 }
 
 /**
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 763ef185dd17..c828c04c947d 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -233,18 +233,10 @@  static struct extent_status *__es_tree_search(struct rb_root *root,
 	return NULL;
 }
 
-/*
- * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering
- * @es->lblk if it exists, otherwise, the next extent after @es->lblk.
- *
- * @inode: the inode which owns delayed extents
- * @lblk: the offset where we start to search
- * @end: the offset where we stop to search
- * @es: delayed extent that we found
- */
-void ext4_es_find_delayed_extent_range(struct inode *inode,
-				 ext4_lblk_t lblk, ext4_lblk_t end,
-				 struct extent_status *es)
+void __ext4_es_find_range_type(struct inode *inode,
+			       ext4_lblk_t lblk, ext4_lblk_t end,
+			       struct extent_status *es,
+			       ext4_es_istype_t *ext4_es_istype)
 {
 	struct ext4_es_tree *tree = NULL;
 	struct extent_status *es1 = NULL;
@@ -252,7 +244,6 @@  void ext4_es_find_delayed_extent_range(struct inode *inode,
 
 	BUG_ON(es == NULL);
 	BUG_ON(end < lblk);
-	trace_ext4_es_find_delayed_extent_range_enter(inode, lblk);
 
 	read_lock(&EXT4_I(inode)->i_es_lock);
 	tree = &EXT4_I(inode)->i_es_tree;
@@ -272,19 +263,19 @@  void ext4_es_find_delayed_extent_range(struct inode *inode,
 	es1 = __es_tree_search(&tree->root, lblk);
 
 out:
-	if (es1 && !ext4_es_is_delayed(es1)) {
+	if (es1 && !ext4_es_istype(es1)) {
 		while ((node = rb_next(&es1->rb_node)) != NULL) {
 			es1 = rb_entry(node, struct extent_status, rb_node);
 			if (es1->es_lblk > end) {
 				es1 = NULL;
 				break;
 			}
-			if (ext4_es_is_delayed(es1))
+			if (ext4_es_istype(es1))
 				break;
 		}
 	}
 
-	if (es1 && ext4_es_is_delayed(es1)) {
+	if (es1 && ext4_es_istype(es1)) {
 		tree->cache_es = es1;
 		es->es_lblk = es1->es_lblk;
 		es->es_len = es1->es_len;
@@ -292,10 +283,36 @@  void ext4_es_find_delayed_extent_range(struct inode *inode,
 	}
 
 	read_unlock(&EXT4_I(inode)->i_es_lock);
+}
+
+/*
+ * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering
+ * @es->lblk if it exists, otherwise, the next extent after @es->lblk.
+ *
+ * @inode: the inode which owns delayed extents
+ * @lblk: the offset where we start to search
+ * @end: the offset where we stop to search
+ * @es: delayed extent that we found
+ */
+void ext4_es_find_delayed_extent_range(struct inode *inode,
+				       ext4_lblk_t lblk, ext4_lblk_t end,
+				       struct extent_status *es)
+{
+	trace_ext4_es_find_delayed_extent_range_enter(inode, lblk);
+
+	__ext4_es_find_range_type(inode, lblk, end, es, ext4_es_is_delayed);
 
 	trace_ext4_es_find_delayed_extent_range_exit(inode, es);
 }
 
+void ext4_es_find_own_rsv_extent_range(struct inode *inode,
+				       ext4_lblk_t lblk, ext4_lblk_t end,
+				       struct extent_status *es)
+{
+	__ext4_es_find_range_type(inode, lblk, end, es,
+				  ext4_es_is_own_rsv);
+}
+
 static void ext4_es_list_add(struct inode *inode)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 8efdeb903d6b..90ce7fd1cfe7 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -93,9 +93,14 @@  extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 extern void ext4_es_find_delayed_extent_range(struct inode *inode,
 					ext4_lblk_t lblk, ext4_lblk_t end,
 					struct extent_status *es);
+extern void ext4_es_find_own_rsv_extent_range(struct inode *inode,
+					ext4_lblk_t lblk, ext4_lblk_t end,
+					struct extent_status *es);
 extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
 				 struct extent_status *es);
 
+typedef int (ext4_es_istype_t)(struct extent_status *es);
+
 static inline unsigned int ext4_es_status(struct extent_status *es)
 {
 	return es->es_pblk >> ES_SHIFT;
@@ -121,6 +126,13 @@  static inline int ext4_es_is_delayed(struct extent_status *es)
 	return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0;
 }
 
+static inline int ext4_es_is_own_rsv(struct extent_status *es)
+{
+	return ext4_es_is_delayed(es) ||
+		ext4_es_is_written(es) ||
+		ext4_es_is_unwritten(es);
+}
+
 static inline int ext4_es_is_hole(struct extent_status *es)
 {
 	return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1e50c5efae67..86f9545fd941 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1679,7 +1679,7 @@  static void ext4_da_page_release_reservation(struct page *page,
 		lblk = (page->index << (PAGE_SHIFT - inode->i_blkbits)) +
 			((num_clusters - 1) << sbi->s_cluster_bits);
 		if (sbi->s_cluster_ratio == 1 ||
-		    !ext4_find_delalloc_cluster(inode, lblk))
+		    !ext4_find_delalloc_or_allocated_cluster(inode, lblk))
 			ext4_da_release_space(inode, 1);
 
 		num_clusters--;
@@ -1867,7 +1867,8 @@  static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 		 * to reserve metadata for every block we're going to write.
 		 */
 		if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 ||
-		    !ext4_find_delalloc_cluster(inode, map->m_lblk)) {
+		    !ext4_find_delalloc_or_allocated_cluster(inode,
+							     map->m_lblk)) {
 			ret = ext4_da_reserve_space(inode);
 			if (ret) {
 				/* not enough space to reserve */