Patchwork [1/2] writeback: fix WB_SYNC_NONE writeback from umount

login
register
mail settings
Submitter Stefan Bader
Date June 2, 2010, 1:09 p.m.
Message ID <1275484147-26044-2-git-send-email-stefan.bader@canonical.com>
Download mbox | patch
Permalink /patch/54370/
State Rejected
Delegated to: Stefan Bader
Headers show

Comments

Stefan Bader - June 2, 2010, 1:09 p.m.
From: Jens Axboe <jens.axboe@oracle.com>

BugLink: http://launchpad.net/bugs/543617

When umount calls sync_filesystem(), we first do a WB_SYNC_NONE
writeback to kick off writeback of pending dirty inodes, then follow
that up with a WB_SYNC_ALL to wait for it. Since umount already holds
the sb s_umount mutex, WB_SYNC_NONE ends up doing nothing and all
writeback happens as WB_SYNC_ALL. This can greatly slow down umount,
since WB_SYNC_ALL writeback is a data integrity operation and thus
a bigger hammer than simple WB_SYNC_NONE. For barrier aware file systems
it's a lot slower.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
(backported from commit e913fc825dc685a444cb4c1d0f9d32f372f59861 upstream)
Signed-off-by: Stefan Bader <stefan.bader@canonical.com>
---
 fs/fs-writeback.c           |   48 +++++++++++++++++++++++++++++++++---------
 fs/sync.c                   |    2 +-
 include/linux/backing-dev.h |    2 +-
 include/linux/writeback.h   |   10 +++++++++
 mm/page-writeback.c         |    2 +-
 5 files changed, 50 insertions(+), 14 deletions(-)

Patch

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index bff5f77..3ec332d 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -44,6 +44,7 @@  struct wb_writeback_args {
 	int for_kupdate:1;
 	int range_cyclic:1;
 	int for_background:1;
+	int sb_pinned:1;
 };
 
 /*
@@ -229,6 +230,11 @@  static void bdi_sync_writeback(struct backing_dev_info *bdi,
 		.sync_mode	= WB_SYNC_ALL,
 		.nr_pages	= LONG_MAX,
 		.range_cyclic	= 0,
+		/*
+		 * Setting sb_pinned is not necessary for WB_SYNC_ALL, but
+		 * lets make it explicitly clear.
+		 */
+		.sb_pinned	= 1,
 	};
 	struct bdi_work work;
 
@@ -243,21 +249,23 @@  static void bdi_sync_writeback(struct backing_dev_info *bdi,
  * bdi_start_writeback - start writeback
  * @bdi: the backing device to write from
  * @nr_pages: the number of pages to write
+ * @sb_locked: caller already holds sb umount sem.
  *
  * Description:
  *   This does WB_SYNC_NONE opportunistic writeback. The IO is only
  *   started when this function returns, we make no guarentees on
- *   completion. Caller need not hold sb s_umount semaphore.
+ *   completion. Caller specifies whether sb umount sem is held already or not.
  *
  */
 void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
-			 long nr_pages)
+			 long nr_pages, int sb_locked)
 {
 	struct wb_writeback_args args = {
 		.sb		= sb,
 		.sync_mode	= WB_SYNC_NONE,
 		.nr_pages	= nr_pages,
 		.range_cyclic	= 1,
+		.sb_pinned	= sb_locked,
 	};
 
 	/*
@@ -584,7 +592,7 @@  static int pin_sb_for_writeback(struct writeback_control *wbc,
 	/*
 	 * Caller must already hold the ref for this
 	 */
-	if (wbc->sync_mode == WB_SYNC_ALL) {
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) {
 		WARN_ON(!rwsem_is_locked(&sb->s_umount));
 		return 0;
 	}
@@ -757,6 +765,7 @@  static long wb_writeback(struct bdi_writeback *wb,
 		.older_than_this	= NULL,
 		.for_kupdate		= args->for_kupdate,
 		.range_cyclic		= args->range_cyclic,
+		.sb_pinned		= args->sb_pinned,
 	};
 	unsigned long oldest_jif;
 	long wrote = 0;
@@ -1190,6 +1199,18 @@  static void wait_sb_inodes(struct super_block *sb)
 	iput(old_inode);
 }
 
+static void __writeback_inodes_sb(struct super_block *sb, int sb_locked)
+{
+	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
+	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+	long nr_to_write;
+
+	nr_to_write = nr_dirty + nr_unstable +
+			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
+
+	bdi_start_writeback(sb->s_bdi, sb, nr_to_write, sb_locked);
+}
+
 /**
  * writeback_inodes_sb	-	writeback dirty inodes from given super_block
  * @sb: the superblock
@@ -1201,18 +1222,23 @@  static void wait_sb_inodes(struct super_block *sb)
  */
 void writeback_inodes_sb(struct super_block *sb)
 {
-	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
-	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
-	long nr_to_write;
-
-	nr_to_write = nr_dirty + nr_unstable +
-			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
-
-	bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
+	__writeback_inodes_sb(sb, 0);
 }
 EXPORT_SYMBOL(writeback_inodes_sb);
 
 /**
+ * writeback_inodes_sb_locked	- writeback dirty inodes from given super_block
+ * @sb: the superblock
+ *
+ * Like writeback_inodes_sb(), except the caller already holds the
+ * sb umount sem.
+ */
+void writeback_inodes_sb_locked(struct super_block *sb)
+{
+	__writeback_inodes_sb(sb, 1);
+}
+
+/**
  * writeback_inodes_sb_if_idle	-	start writeback if none underway
  * @sb: the superblock
  *
diff --git a/fs/sync.c b/fs/sync.c
index d104591..8932a3e 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -37,7 +37,7 @@  static int __sync_filesystem(struct super_block *sb, int wait)
 	/* Avoid doing twice syncing and cache pruning for quota sync */
 	if (!wait) {
 		writeout_quota_sb(sb, -1);
-		writeback_inodes_sb(sb);
+		writeback_inodes_sb_locked(sb);
 	} else {
 		sync_quota_sb(sb, -1);
 		sync_inodes_sb(sb);
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index b449e73..f257a23 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -102,7 +102,7 @@  int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 void bdi_unregister(struct backing_dev_info *bdi);
 void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
-				long nr_pages);
+				long nr_pages, int sb_locked);
 int bdi_writeback_task(struct bdi_writeback *wb);
 int bdi_has_dirty_io(struct backing_dev_info *bdi);
 
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index dc52482..b62f517 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -61,6 +61,15 @@  struct writeback_control {
 	 * so we use a single control to update them
 	 */
 	unsigned no_nrwrite_index_update:1;
+
+	/*
+	 * For WB_SYNC_ALL, the sb must always be pinned. For WB_SYNC_NONE,
+	 * the writeback code will pin the sb for the caller. However,
+	 * for eg umount, the caller does WB_SYNC_NONE but already has
+	 * the sb pinned. If the below is set, caller already has the
+	 * sb pinned.
+	 */
+	unsigned sb_pinned:1;
 };
 
 /*
@@ -69,6 +78,7 @@  struct writeback_control {
 struct bdi_writeback;
 int inode_wait(void *);
 void writeback_inodes_sb(struct super_block *);
+void writeback_inodes_sb_locked(struct super_block *);
 int writeback_inodes_sb_if_idle(struct super_block *);
 void sync_inodes_sb(struct super_block *);
 void writeback_inodes_wbc(struct writeback_control *wbc);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2c5d792..e73dbb7 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -597,7 +597,7 @@  static void balance_dirty_pages(struct address_space *mapping,
 	    (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
 			       + global_page_state(NR_UNSTABLE_NFS))
 					  > background_thresh)))
-		bdi_start_writeback(bdi, NULL, 0);
+		bdi_start_writeback(bdi, NULL, 0, 0);
 }
 
 void set_page_dirty_balance(struct page *page, int page_mkwrite)