
[stable,00/16]: Fix writeback regressions in 2.6.32

Message ID 4C636A79.7020704@fusionio.com
State Rejected
Delegated to: Stefan Bader

Commit Message

Jens Axboe Aug. 12, 2010, 3:28 a.m. UTC
On 08/11/2010 04:12 PM, Jens Axboe wrote:
> On 08/11/2010 03:59 PM, Greg KH wrote:
>> On Tue, Aug 10, 2010 at 11:28:48AM +0200, Stefan Bader wrote:
>>> Impact: 2.6.32 is facing some serious performance regressions which are
>>> plainly visible when unmounting filesystems, as async writeback degrades
>>> to sync in that case. But there seem to be other, less visible problems
>>> as well. I believe this set of patches would also be beneficial for
>>> upstream stable.
>>>
>>> Fix: This series of patches was picked and, where needed, backported to
>>> apply to 2.6.32.y (there is another set for 2.6.34.y[1]). It probably
>>> does not solve all currently known problems, but testing has shown
>>> considerable improvements. At the beginning of the set there are some
>>> patches that were picked to make the later patches applicable. Only one
>>> of them is a bit more intrusive; the others are rather trivial.
>>>
>>> Testcase: Unmounting a fs that still has larger amounts of data to write
>>> back to disk will take a noticeably longer time to complete (IIRC it
>>> might be 30min). General responsiveness was also reported to improve.
>>> So far there are no known regressions reported when testing this
>>> backport.
>>
>> Jens, what do you think about this series?
> 
> I think it is scary :-)
> 
> But seriously, we need to do something about this umount regression and
> there's likely no way around this series. Doing the full backport is
> the best option. Stefan, to what extent did you test this backport?

So I discussed this with Jan tonight, and a better solution may be
something similar to my first attempt at fixing this bug earlier.
Basically just pass in the info on whether this is done from umount or
not. The bug here is that umount already holds the sb sem when doing
WB_SYNC_NONE writeback, so we skip all dirty inodes for that pass and
end up doing everything as WB_SYNC_ALL. If we pass in this lock
information, then wb_writeback() knows when to skip pinning the sb.
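
To illustrate, this is roughly what 2.6.32's pin_sb_for_writeback() does
today (trimmed; the sb_lock/s_count reference handling is left out):

static int pin_sb_for_writeback(struct writeback_control *wbc,
				struct inode *inode)
{
	struct super_block *sb = inode->i_sb;

	/* WB_SYNC_ALL callers are expected to hold s_umount already */
	if (wbc->sync_mode == WB_SYNC_ALL) {
		WARN_ON(!rwsem_is_locked(&sb->s_umount));
		return 0;
	}

	/*
	 * For WB_SYNC_NONE we have to pin the sb ourselves. umount
	 * holds s_umount for writing, so this trylock always fails
	 * there and the caller requeues the inode instead of writing
	 * it -- everything is then left to the WB_SYNC_ALL pass.
	 */
	if (down_read_trylock(&sb->s_umount)) {
		if (sb->s_root)
			return 0;
		up_read(&sb->s_umount);
	}

	return 1;	/* could not pin, skip this inode */
}

With wbc->locked short-circuiting that check the same way WB_SYNC_ALL
does, the WB_SYNC_NONE pass issued from umount can actually write the
inodes out.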

The below patch is totally untested. Stefan, care to give it a spin with
the test case? It would be a lot less intrusive than the full backport,
which mostly consists of cleanups and optimizations that we really should
keep out of 2.6.32-stable if at all avoidable.

Patch

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 9d5360c..eb0ad8c 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -44,6 +44,7 @@  struct wb_writeback_args {
 	int for_kupdate:1;
 	int range_cyclic:1;
 	int for_background:1;
+	int locked:1;
 };
 
 /*
@@ -251,13 +252,14 @@  static void bdi_sync_writeback(struct backing_dev_info *bdi,
  *
  */
 void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
-			 long nr_pages)
+			 long nr_pages, int locked)
 {
 	struct wb_writeback_args args = {
 		.sb		= sb,
 		.sync_mode	= WB_SYNC_NONE,
 		.nr_pages	= nr_pages,
 		.range_cyclic	= 1,
+		.locked		= locked,
 	};
 
 	/*
@@ -584,7 +586,7 @@  static int pin_sb_for_writeback(struct writeback_control *wbc,
 	/*
 	 * Caller must already hold the ref for this
 	 */
-	if (wbc->sync_mode == WB_SYNC_ALL) {
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->locked) {
 		WARN_ON(!rwsem_is_locked(&sb->s_umount));
 		return 0;
 	}
@@ -757,6 +759,7 @@  static long wb_writeback(struct bdi_writeback *wb,
 		.older_than_this	= NULL,
 		.for_kupdate		= args->for_kupdate,
 		.range_cyclic		= args->range_cyclic,
+		.locked			= args->locked,
 	};
 	unsigned long oldest_jif;
 	long wrote = 0;
@@ -905,7 +908,7 @@  long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 		 * If this isn't a data integrity operation, just notify
 		 * that we have seen this work and we are now starting it.
 		 */
-		if (args.sync_mode == WB_SYNC_NONE)
+		if (args.sync_mode == WB_SYNC_NONE && !args.locked)
 			wb_clear_pending(wb, work);
 
 		wrote += wb_writeback(wb, &args);
@@ -914,7 +917,7 @@  long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 		 * This is a data integrity writeback, so only do the
 		 * notification when we have completed the work.
 		 */
-		if (args.sync_mode == WB_SYNC_ALL)
+		if (args.sync_mode == WB_SYNC_ALL || args.locked)
 			wb_clear_pending(wb, work);
 	}
 
@@ -1193,13 +1196,14 @@  static void wait_sb_inodes(struct super_block *sb)
 /**
  * writeback_inodes_sb	-	writeback dirty inodes from given super_block
  * @sb: the superblock
+ * @locked: sb already pinned
  *
  * Start writeback on some inodes on this super_block. No guarantees are made
  * on how many (if any) will be written, and this function does not wait
  * for IO completion of submitted IO. The number of pages submitted is
  * returned.
  */
-void writeback_inodes_sb(struct super_block *sb)
+void writeback_inodes_sb(struct super_block *sb, int locked)
 {
 	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
 	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
@@ -1208,7 +1212,7 @@  void writeback_inodes_sb(struct super_block *sb)
 	nr_to_write = nr_dirty + nr_unstable +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
 
-	bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
+	bdi_start_writeback(sb->s_bdi, sb, nr_to_write, locked);
 }
 EXPORT_SYMBOL(writeback_inodes_sb);
 
diff --git a/fs/sync.c b/fs/sync.c
index d104591..41d543e 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -37,7 +37,7 @@  static int __sync_filesystem(struct super_block *sb, int wait)
 	/* Avoid doing twice syncing and cache pruning for quota sync */
 	if (!wait) {
 		writeout_quota_sb(sb, -1);
-		writeback_inodes_sb(sb);
+		writeback_inodes_sb(sb, 1);
 	} else {
 		sync_quota_sb(sb, -1);
 		sync_inodes_sb(sb);
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 076ca50..d16a8eb 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -62,7 +62,7 @@ 
  */
 static void shrink_liability(struct ubifs_info *c, int nr_to_write)
 {
-	writeback_inodes_sb(c->vfs_sb);
+	writeback_inodes_sb(c->vfs_sb, 0);
 }
 
 /**
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index b449e73..ce997ba 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -102,7 +102,7 @@  int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 void bdi_unregister(struct backing_dev_info *bdi);
 void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
-				long nr_pages);
+				long nr_pages, int locked);
 int bdi_writeback_task(struct bdi_writeback *wb);
 int bdi_has_dirty_io(struct backing_dev_info *bdi);
 
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 66ebddc..d15eb2b 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -52,6 +52,8 @@  struct writeback_control {
 	unsigned for_reclaim:1;		/* Invoked from the page allocator */
 	unsigned range_cyclic:1;	/* range_start is cyclic */
 	unsigned more_io:1;		/* more io to be dispatched */
+	unsigned locked:1;		/* sb pinned for WB_SYNC_NONE */
+
 	/*
 	 * write_cache_pages() won't update wbc->nr_to_write and
 	 * mapping->writeback_index if no_nrwrite_index_update
@@ -68,7 +70,7 @@  struct writeback_control {
  */	
 struct bdi_writeback;
 int inode_wait(void *);
-void writeback_inodes_sb(struct super_block *);
+void writeback_inodes_sb(struct super_block *, int);
 void sync_inodes_sb(struct super_block *);
 void writeback_inodes_wbc(struct writeback_control *wbc);
 long wb_do_writeback(struct bdi_writeback *wb, int force_wait);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2c5d792..e73dbb7 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -597,7 +597,7 @@  static void balance_dirty_pages(struct address_space *mapping,
 	    (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
 			       + global_page_state(NR_UNSTABLE_NFS))
 					  > background_thresh)))
-		bdi_start_writeback(bdi, NULL, 0);
+		bdi_start_writeback(bdi, NULL, 0, 0);
 }
 
 void set_page_dirty_balance(struct page *page, int page_mkwrite)