diff mbox

[updated] ext4: Fix file fragmentation during large file write.

Message ID 1223661776-20098-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com
State Superseded, archived
Headers show

Commit Message

Aneesh Kumar K.V Oct. 10, 2008, 6:02 p.m. UTC
The range_cyclic writeback mode use the address_space
writeback_index as the start index for writeback. With
delayed allocation we were updating writeback_index
wrongly resulting in highly fragmented file. Number of
extents reduced from 4000 to 27 for a 3GB file with
the below patch.

The patch also removes the range_cont writeback mode
added for ext4 delayed allocation. Instead we add
two new flags in writeback_control which control
the behaviour of write_cache_pages.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext4/inode.c           |  110 +++++++++++++++++++++++++-------------------
 include/linux/writeback.h |    5 ++-
 mm/page-writeback.c       |   11 +++--
 3 files changed, 73 insertions(+), 53 deletions(-)

Comments

Chris Mason Oct. 10, 2008, 7:15 p.m. UTC | #1
On Fri, 2008-10-10 at 23:32 +0530, Aneesh Kumar K.V wrote:
> The range_cyclic writeback mode use the address_space
> writeback_index as the start index for writeback. With
> delayed allocation we were updating writeback_index
> wrongly resulting in highly fragmented file. Number of
> extents reduced from 4000 to 27 for a 3GB file with
> the below patch.
> 
> The patch also removes the range_cont writeback mode
> added for ext4 delayed allocation. Instead we add
> two new flags in writeback_control which control
> the behaviour of write_cache_pages.
> 

I'm sorry, but I won't be able to test this until next wednesday.  In
general, I like the structure of it, and I can see this being useful for
other filesystems too.

-chris


--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Nick Piggin Oct. 11, 2008, 10:51 a.m. UTC | #2
On Fri, Oct 10, 2008 at 11:32:56PM +0530, Aneesh Kumar K.V wrote:
> The range_cyclic writeback mode use the address_space
> writeback_index as the start index for writeback. With
> delayed allocation we were updating writeback_index
> wrongly resulting in highly fragmented file. Number of
> extents reduced from 4000 to 27 for a 3GB file with
> the below patch.
> 
> The patch also removes the range_cont writeback mode
> added for ext4 delayed allocation. Instead we add
> two new flags in writeback_control which control
> the behaviour of write_cache_pages.

The mm/page-writeback.c changes look OK, although it loks like you've
got rid of range_cont? Should we do a patch to get rid of it entirely
from the tree first?

I don't mind rediffing my patchset on top of this, but this seems smaller
and not strictly a bugfix so I would prefer to go the other way if you
agree.

Seems like it could be broken up into several patches (eg. pagevec_lookup).

The results look very nice.

> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> ---
>  fs/ext4/inode.c           |  110 +++++++++++++++++++++++++-------------------
>  include/linux/writeback.h |    5 ++-
>  mm/page-writeback.c       |   11 +++--
>  3 files changed, 73 insertions(+), 53 deletions(-)
> 
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 7c2820e..f8890b9 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1648,6 +1648,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
>  	int ret = 0, err, nr_pages, i;
>  	unsigned long index, end;
>  	struct pagevec pvec;
> +	long pages_skipped;
>  
>  	BUG_ON(mpd->next_page <= mpd->first_page);
>  	pagevec_init(&pvec, 0);
> @@ -1655,20 +1656,30 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
>  	end = mpd->next_page - 1;
>  
>  	while (index <= end) {
> -		/* XXX: optimize tail */
> -		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
> +		/*
> +		 * We can use PAGECACHE_TAG_DIRTY lookup here because
> +		 * even though we have cleared the dirty flag on the page
> +		 * We still keep the page in the radix tree with tag
> +		 * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io.
> +		 * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback
> +		 * which is called via the below writepage callback.
> +		 */
> +		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
> +					PAGECACHE_TAG_DIRTY,
> +					min(end - index,
> +					(pgoff_t)PAGEVEC_SIZE-1) + 1);
>  		if (nr_pages == 0)
>  			break;
>  		for (i = 0; i < nr_pages; i++) {
>  			struct page *page = pvec.pages[i];
>  
> -			index = page->index;
> -			if (index > end)
> -				break;
> -			index++;
> -
> +			pages_skipped = mpd->wbc->pages_skipped;
>  			err = mapping->a_ops->writepage(page, mpd->wbc);
> -			if (!err)
> +			if (!err && (pages_skipped == mpd->wbc->pages_skipped))
> +				/*
> +				 * have successfully written the page
> +				 * without skipping the same
> +				 */
>  				mpd->pages_written++;
>  			/*
>  			 * In error case, we have to continue because
> @@ -2104,7 +2115,6 @@ static int mpage_da_writepages(struct address_space *mapping,
>  			       struct writeback_control *wbc,
>  			       struct mpage_da_data *mpd)
>  {
> -	long to_write;
>  	int ret;
>  
>  	if (!mpd->get_block)
> @@ -2119,10 +2129,7 @@ static int mpage_da_writepages(struct address_space *mapping,
>  	mpd->pages_written = 0;
>  	mpd->retval = 0;
>  
> -	to_write = wbc->nr_to_write;
> -
>  	ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
> -
>  	/*
>  	 * Handle last extent of pages
>  	 */
> @@ -2131,7 +2138,7 @@ static int mpage_da_writepages(struct address_space *mapping,
>  			mpage_da_submit_io(mpd);
>  	}
>  
> -	wbc->nr_to_write = to_write - mpd->pages_written;
> +	wbc->nr_to_write -= mpd->pages_written;
>  	return ret;
>  }
>  
> @@ -2360,12 +2367,14 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
>  static int ext4_da_writepages(struct address_space *mapping,
>  			      struct writeback_control *wbc)
>  {
> +	pgoff_t	index;
> +	int range_whole = 0;
>  	handle_t *handle = NULL;
> -	loff_t range_start = 0;
> +	long pages_written = 0;
>  	struct mpage_da_data mpd;
>  	struct inode *inode = mapping->host;
> +	int no_nrwrite_update, no_index_update;
>  	int needed_blocks, ret = 0, nr_to_writebump = 0;
> -	long to_write, pages_skipped = 0;
>  	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
>  
>  	/*
> @@ -2385,23 +2394,27 @@ static int ext4_da_writepages(struct address_space *mapping,
>  		nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
>  		wbc->nr_to_write = sbi->s_mb_stream_request;
>  	}
> +	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
> +		range_whole = 1;
>  
> -	if (!wbc->range_cyclic)
> -		/*
> -		 * If range_cyclic is not set force range_cont
> -		 * and save the old writeback_index
> -		 */
> -		wbc->range_cont = 1;
> -
> -	range_start =  wbc->range_start;
> -	pages_skipped = wbc->pages_skipped;
> +	if (wbc->range_cyclic)
> +		index = mapping->writeback_index;
> +	else
> +		index = wbc->range_start >> PAGE_CACHE_SHIFT;
>  
>  	mpd.wbc = wbc;
>  	mpd.inode = mapping->host;
>  
> -restart_loop:
> -	to_write = wbc->nr_to_write;
> -	while (!ret && to_write > 0) {
> +	/*
> +	 * we don't want write_cache_pages to update
> +	 * nr_to_write and writeback_index
> +	 */
> +	no_nrwrite_update = wbc->no_nrwrite_update;
> +	wbc->no_nrwrite_update = 1;
> +	no_index_update = wbc->no_index_update;
> +	wbc->no_index_update   = 1;
> +
> +	while (!ret && wbc->nr_to_write > 0) {
>  
>  		/*
>  		 * we  insert one extent at a time. So we need
> @@ -2422,48 +2435,49 @@ static int ext4_da_writepages(struct address_space *mapping,
>  			dump_stack();
>  			goto out_writepages;
>  		}
> -		to_write -= wbc->nr_to_write;
> -
>  		mpd.get_block = ext4_da_get_block_write;
>  		ret = mpage_da_writepages(mapping, wbc, &mpd);
>  
>  		ext4_journal_stop(handle);
>  
> -		if (mpd.retval == -ENOSPC)
> +		if (mpd.retval == -ENOSPC) {
> +			/* commit the transaction which would
> +			 * free blocks released in the transaction
> +			 * and try again
> +			 */
>  			jbd2_journal_force_commit_nested(sbi->s_journal);
> -
> -		/* reset the retry count */
> -		if (ret == MPAGE_DA_EXTENT_TAIL) {
> +			ret = 0;
> +		} else if (ret == MPAGE_DA_EXTENT_TAIL) {
>  			/*
>  			 * got one extent now try with
>  			 * rest of the pages
>  			 */
> -			to_write += wbc->nr_to_write;
> +			pages_written += mpd.pages_written;
>  			ret = 0;
> -		} else if (wbc->nr_to_write) {
> +		} else if (wbc->nr_to_write)
>  			/*
>  			 * There is no more writeout needed
>  			 * or we requested for a noblocking writeout
>  			 * and we found the device congested
>  			 */
> -			to_write += wbc->nr_to_write;
>  			break;
> -		}
> -		wbc->nr_to_write = to_write;
>  	}
>  
> -	if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) {
> -		/* We skipped pages in this loop */
> -		wbc->range_start = range_start;
> -		wbc->nr_to_write = to_write +
> -				wbc->pages_skipped - pages_skipped;
> -		wbc->pages_skipped = pages_skipped;
> -		goto restart_loop;
> -	}
> +	/* Update index */
> +	index += pages_written;
> +	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
> +		/*
> +		 * set the writeback_index so that range_cyclic
> +		 * mode will write it back later
> +		 */
> +		mapping->writeback_index = index;
>  
>  out_writepages:
> -	wbc->nr_to_write = to_write - nr_to_writebump;
> -	wbc->range_start = range_start;
> +	if (!no_nrwrite_update)
> +		wbc->no_nrwrite_update = 0;
> +	if (!no_index_update)
> +		wbc->no_index_update   = 0;
> +	wbc->nr_to_write -= nr_to_writebump;
>  	return ret;
>  }
>  
> diff --git a/include/linux/writeback.h b/include/linux/writeback.h
> index 12b15c5..b04287e 100644
> --- a/include/linux/writeback.h
> +++ b/include/linux/writeback.h
> @@ -63,7 +63,10 @@ struct writeback_control {
>  	unsigned for_writepages:1;	/* This is a writepages() call */
>  	unsigned range_cyclic:1;	/* range_start is cyclic */
>  	unsigned more_io:1;		/* more io to be dispatched */
> -	unsigned range_cont:1;
> +
> +	/* write_cache_pages() control */
> +	unsigned no_nrwrite_update:1;	/* don't update nr_to_write */
> +	unsigned no_index_update:1;	/* don't update writeback_index */
>  };
>  
>  /*
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 24de8b6..a85930c 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -876,6 +876,7 @@ int write_cache_pages(struct address_space *mapping,
>  	pgoff_t end;		/* Inclusive */
>  	int scanned = 0;
>  	int range_whole = 0;
> +	long nr_to_write = wbc->nr_to_write;
>  
>  	if (wbc->nonblocking && bdi_write_congested(bdi)) {
>  		wbc->encountered_congestion = 1;
> @@ -939,7 +940,7 @@ int write_cache_pages(struct address_space *mapping,
>  				unlock_page(page);
>  				ret = 0;
>  			}
> -			if (ret || (--(wbc->nr_to_write) <= 0))
> +			if (ret || (--nr_to_write <= 0))
>  				done = 1;
>  			if (wbc->nonblocking && bdi_write_congested(bdi)) {
>  				wbc->encountered_congestion = 1;
> @@ -958,11 +959,13 @@ int write_cache_pages(struct address_space *mapping,
>  		index = 0;
>  		goto retry;
>  	}
> -	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
> +	if (!wbc->no_index_update &&
> +		(wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))) {
>  		mapping->writeback_index = index;
> +	}
> +	if (!wbc->no_nrwrite_update)
> +		wbc->nr_to_write = nr_to_write;
>  
> -	if (wbc->range_cont)
> -		wbc->range_start = index << PAGE_CACHE_SHIFT;
>  	return ret;
>  }
>  EXPORT_SYMBOL(write_cache_pages);
> -- 
> 1.6.0.1.285.g1070
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Aneesh Kumar K.V Oct. 11, 2008, 6:14 p.m. UTC | #3
On Sat, Oct 11, 2008 at 12:51:52PM +0200, Nick Piggin wrote:
> On Fri, Oct 10, 2008 at 11:32:56PM +0530, Aneesh Kumar K.V wrote:
> > The range_cyclic writeback mode use the address_space
> > writeback_index as the start index for writeback. With
> > delayed allocation we were updating writeback_index
> > wrongly resulting in highly fragmented file. Number of
> > extents reduced from 4000 to 27 for a 3GB file with
> > the below patch.
> > 
> > The patch also removes the range_cont writeback mode
> > added for ext4 delayed allocation. Instead we add
> > two new flags in writeback_control which control
> > the behaviour of write_cache_pages.
> 
> The mm/page-writeback.c changes look OK, although it loks like you've
> got rid of range_cont? Should we do a patch to get rid of it entirely
> from the tree first?
> 
> I don't mind rediffing my patchset on top of this, but this seems smaller
> and not strictly a bugfix so I would prefer to go the other way if you
> agree.
> 
> Seems like it could be broken up into several patches (eg. pagevec_lookup).
> 
> The results look very nice.

I actually tried to do that. But to do that and also achieve a working
bisect kernel, I will have to do the patches in below way

a) Introduce ext4_write_cache_pages
b) remove range_cont from write_cache_pages
c) Introduce the new flags to writeback_control
d) switch ext4 to use write_cache_pages.

I thought that involved lot of code which are later getting removed.
So i went for a single patch.

-aneesh
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Aneesh Kumar K.V Oct. 11, 2008, 7:07 p.m. UTC | #4
On Sat, Oct 11, 2008 at 11:44:26PM +0530, Aneesh Kumar K.V wrote:
> On Sat, Oct 11, 2008 at 12:51:52PM +0200, Nick Piggin wrote:
> > On Fri, Oct 10, 2008 at 11:32:56PM +0530, Aneesh Kumar K.V wrote:
> > > The range_cyclic writeback mode use the address_space
> > > writeback_index as the start index for writeback. With
> > > delayed allocation we were updating writeback_index
> > > wrongly resulting in highly fragmented file. Number of
> > > extents reduced from 4000 to 27 for a 3GB file with
> > > the below patch.
> > > 
> > > The patch also removes the range_cont writeback mode
> > > added for ext4 delayed allocation. Instead we add
> > > two new flags in writeback_control which control
> > > the behaviour of write_cache_pages.
> > 
> > The mm/page-writeback.c changes look OK, although it loks like you've
> > got rid of range_cont? Should we do a patch to get rid of it entirely
> > from the tree first?
> > 
> > I don't mind rediffing my patchset on top of this, but this seems smaller
> > and not strictly a bugfix so I would prefer to go the other way if you
> > agree.
> > 
> > Seems like it could be broken up into several patches (eg. pagevec_lookup).
> > 
> > The results look very nice.
> 
> I actually tried to do that. But to do that and also achieve a working
> bisect kernel, I will have to do the patches in below way
> 
> a) Introduce ext4_write_cache_pages
> b) remove range_cont from write_cache_pages
> c) Introduce the new flags to writeback_control
> d) switch ext4 to use write_cache_pages.
> 
> I thought that involved lot of code which are later getting removed.
> So i went for a single patch.
> 

Ok I did the split as below.

a) ext4: Use tag dirty lookup during mpage_da_submit_io
b) vfs: Remove the range_cont writeback mode.
c) vfs: Add no_nrwrite_update and no_index_update writeback control flags
d) ext4: Fix file fragmentation during large file write.

I have sent the updated patches to ext4 list and also to you. Let me
know what you think. The final change is same as the old patch. So
no new changes added

-aneesh

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Theodore Ts'o Oct. 16, 2008, 2:41 a.m. UTC | #5
On Wed, Oct 15, 2008 at 07:51:32PM -0400, Chris Mason wrote:
> 
> Just FYI, I ran this with compilebench -i 20 --makej and my log is full
> of these:
> 
> ext4_da_writepages: jbd2_start: 1024 pages, ino 520417; err -30
> Pid: 4072, comm: pdflush Not tainted 2.6.27 #2

That's from ext4_journal_start_sb:

       if (sb->s_flags & MS_RDONLY)
       	  return ERR_PTR(-EROFS);

What was the very first error in your log?  It looks like ext4 somehow
flagged some kind of filesystem error or aborted the journal due to
some failure, and log gets filled these messages.  We should probably 
should throttle these messages by simply putting a

       if (sb->s_flags & MS_RDONLY)
		return -ERNOFS;

at the beginning of ext4_da_writepages() so we don't fill the logs
with extraneous messages that obscure more important error messages.

     			      	      	   	     - Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andreas Dilger Oct. 16, 2008, 2:44 a.m. UTC | #6
On Oct 15, 2008  19:51 -0400, Chris Mason wrote:
> Just FYI, I ran this with compilebench -i 20 --makej and my log is full
> of these:
> 
> ext4_da_writepages: jbd2_start: 1024 pages, ino 520417; err -30
> Pid: 4072, comm: pdflush Not tainted 2.6.27 #2

-30 is -EROFS...  Was the filesystem remounted read-only because of
an error?

> Call Trace:
>  [<ffffffffa0048493>] ext4_da_writepages+0x171/0x2d3 [ext4]
>  [<ffffffff802336be>] ? pick_next_task_fair+0x80/0x91
>  [<ffffffff80228fa8>] ? source_load+0x2a/0x58
>  [<ffffffff8038e499>] ? __next_cpu+0x19/0x26
>  [<ffffffff8026748f>] do_writepages+0x28/0x37
>  [<ffffffff802a6b39>] __writeback_single_inode+0x14f/0x26d
>  [<ffffffff802a6fb7>] generic_sync_sb_inodes+0x1c1/0x2a2
>  [<ffffffff802a70a1>] sync_sb_inodes+0x9/0xb
>  [<ffffffff802a73dc>] writeback_inodes+0x64/0xad
>  [<ffffffff802675db>] wb_kupdate+0x9a/0x10c
>  [<ffffffff80267fd1>] ? pdflush+0x0/0x1e9
>  [<ffffffff80267fd1>] ? pdflush+0x0/0x1e9
>  [<ffffffff8026810e>] pdflush+0x13d/0x1e9
>  [<ffffffff80267541>] ? wb_kupdate+0x0/0x10c
>  [<ffffffff80248222>] kthread+0x49/0x77
>  [<ffffffff8020c5e9>] child_rip+0xa/0x11
>  [<ffffffff802481d9>] ? kthread+0x0/0x77
>  [<ffffffff8020c5df>] ? child_rip+0x0/0x11
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7c2820e..f8890b9 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1648,6 +1648,7 @@  static int mpage_da_submit_io(struct mpage_da_data *mpd)
 	int ret = 0, err, nr_pages, i;
 	unsigned long index, end;
 	struct pagevec pvec;
+	long pages_skipped;
 
 	BUG_ON(mpd->next_page <= mpd->first_page);
 	pagevec_init(&pvec, 0);
@@ -1655,20 +1656,30 @@  static int mpage_da_submit_io(struct mpage_da_data *mpd)
 	end = mpd->next_page - 1;
 
 	while (index <= end) {
-		/* XXX: optimize tail */
-		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+		/*
+		 * We can use PAGECACHE_TAG_DIRTY lookup here because
+		 * even though we have cleared the dirty flag on the page
+		 * We still keep the page in the radix tree with tag
+		 * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io.
+		 * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback
+		 * which is called via the below writepage callback.
+		 */
+		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+					PAGECACHE_TAG_DIRTY,
+					min(end - index,
+					(pgoff_t)PAGEVEC_SIZE-1) + 1);
 		if (nr_pages == 0)
 			break;
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
 
-			index = page->index;
-			if (index > end)
-				break;
-			index++;
-
+			pages_skipped = mpd->wbc->pages_skipped;
 			err = mapping->a_ops->writepage(page, mpd->wbc);
-			if (!err)
+			if (!err && (pages_skipped == mpd->wbc->pages_skipped))
+				/*
+				 * have successfully written the page
+				 * without skipping the same
+				 */
 				mpd->pages_written++;
 			/*
 			 * In error case, we have to continue because
@@ -2104,7 +2115,6 @@  static int mpage_da_writepages(struct address_space *mapping,
 			       struct writeback_control *wbc,
 			       struct mpage_da_data *mpd)
 {
-	long to_write;
 	int ret;
 
 	if (!mpd->get_block)
@@ -2119,10 +2129,7 @@  static int mpage_da_writepages(struct address_space *mapping,
 	mpd->pages_written = 0;
 	mpd->retval = 0;
 
-	to_write = wbc->nr_to_write;
-
 	ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
-
 	/*
 	 * Handle last extent of pages
 	 */
@@ -2131,7 +2138,7 @@  static int mpage_da_writepages(struct address_space *mapping,
 			mpage_da_submit_io(mpd);
 	}
 
-	wbc->nr_to_write = to_write - mpd->pages_written;
+	wbc->nr_to_write -= mpd->pages_written;
 	return ret;
 }
 
@@ -2360,12 +2367,14 @@  static int ext4_da_writepages_trans_blocks(struct inode *inode)
 static int ext4_da_writepages(struct address_space *mapping,
 			      struct writeback_control *wbc)
 {
+	pgoff_t	index;
+	int range_whole = 0;
 	handle_t *handle = NULL;
-	loff_t range_start = 0;
+	long pages_written = 0;
 	struct mpage_da_data mpd;
 	struct inode *inode = mapping->host;
+	int no_nrwrite_update, no_index_update;
 	int needed_blocks, ret = 0, nr_to_writebump = 0;
-	long to_write, pages_skipped = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
 
 	/*
@@ -2385,23 +2394,27 @@  static int ext4_da_writepages(struct address_space *mapping,
 		nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
 		wbc->nr_to_write = sbi->s_mb_stream_request;
 	}
+	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+		range_whole = 1;
 
-	if (!wbc->range_cyclic)
-		/*
-		 * If range_cyclic is not set force range_cont
-		 * and save the old writeback_index
-		 */
-		wbc->range_cont = 1;
-
-	range_start =  wbc->range_start;
-	pages_skipped = wbc->pages_skipped;
+	if (wbc->range_cyclic)
+		index = mapping->writeback_index;
+	else
+		index = wbc->range_start >> PAGE_CACHE_SHIFT;
 
 	mpd.wbc = wbc;
 	mpd.inode = mapping->host;
 
-restart_loop:
-	to_write = wbc->nr_to_write;
-	while (!ret && to_write > 0) {
+	/*
+	 * we don't want write_cache_pages to update
+	 * nr_to_write and writeback_index
+	 */
+	no_nrwrite_update = wbc->no_nrwrite_update;
+	wbc->no_nrwrite_update = 1;
+	no_index_update = wbc->no_index_update;
+	wbc->no_index_update   = 1;
+
+	while (!ret && wbc->nr_to_write > 0) {
 
 		/*
 		 * we  insert one extent at a time. So we need
@@ -2422,48 +2435,49 @@  static int ext4_da_writepages(struct address_space *mapping,
 			dump_stack();
 			goto out_writepages;
 		}
-		to_write -= wbc->nr_to_write;
-
 		mpd.get_block = ext4_da_get_block_write;
 		ret = mpage_da_writepages(mapping, wbc, &mpd);
 
 		ext4_journal_stop(handle);
 
-		if (mpd.retval == -ENOSPC)
+		if (mpd.retval == -ENOSPC) {
+			/* commit the transaction which would
+			 * free blocks released in the transaction
+			 * and try again
+			 */
 			jbd2_journal_force_commit_nested(sbi->s_journal);
-
-		/* reset the retry count */
-		if (ret == MPAGE_DA_EXTENT_TAIL) {
+			ret = 0;
+		} else if (ret == MPAGE_DA_EXTENT_TAIL) {
 			/*
 			 * got one extent now try with
 			 * rest of the pages
 			 */
-			to_write += wbc->nr_to_write;
+			pages_written += mpd.pages_written;
 			ret = 0;
-		} else if (wbc->nr_to_write) {
+		} else if (wbc->nr_to_write)
 			/*
 			 * There is no more writeout needed
 			 * or we requested for a noblocking writeout
 			 * and we found the device congested
 			 */
-			to_write += wbc->nr_to_write;
 			break;
-		}
-		wbc->nr_to_write = to_write;
 	}
 
-	if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) {
-		/* We skipped pages in this loop */
-		wbc->range_start = range_start;
-		wbc->nr_to_write = to_write +
-				wbc->pages_skipped - pages_skipped;
-		wbc->pages_skipped = pages_skipped;
-		goto restart_loop;
-	}
+	/* Update index */
+	index += pages_written;
+	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+		/*
+		 * set the writeback_index so that range_cyclic
+		 * mode will write it back later
+		 */
+		mapping->writeback_index = index;
 
 out_writepages:
-	wbc->nr_to_write = to_write - nr_to_writebump;
-	wbc->range_start = range_start;
+	if (!no_nrwrite_update)
+		wbc->no_nrwrite_update = 0;
+	if (!no_index_update)
+		wbc->no_index_update   = 0;
+	wbc->nr_to_write -= nr_to_writebump;
 	return ret;
 }
 
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 12b15c5..b04287e 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -63,7 +63,10 @@  struct writeback_control {
 	unsigned for_writepages:1;	/* This is a writepages() call */
 	unsigned range_cyclic:1;	/* range_start is cyclic */
 	unsigned more_io:1;		/* more io to be dispatched */
-	unsigned range_cont:1;
+
+	/* write_cache_pages() control */
+	unsigned no_nrwrite_update:1;	/* don't update nr_to_write */
+	unsigned no_index_update:1;	/* don't update writeback_index */
 };
 
 /*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 24de8b6..a85930c 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -876,6 +876,7 @@  int write_cache_pages(struct address_space *mapping,
 	pgoff_t end;		/* Inclusive */
 	int scanned = 0;
 	int range_whole = 0;
+	long nr_to_write = wbc->nr_to_write;
 
 	if (wbc->nonblocking && bdi_write_congested(bdi)) {
 		wbc->encountered_congestion = 1;
@@ -939,7 +940,7 @@  int write_cache_pages(struct address_space *mapping,
 				unlock_page(page);
 				ret = 0;
 			}
-			if (ret || (--(wbc->nr_to_write) <= 0))
+			if (ret || (--nr_to_write <= 0))
 				done = 1;
 			if (wbc->nonblocking && bdi_write_congested(bdi)) {
 				wbc->encountered_congestion = 1;
@@ -958,11 +959,13 @@  int write_cache_pages(struct address_space *mapping,
 		index = 0;
 		goto retry;
 	}
-	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+	if (!wbc->no_index_update &&
+		(wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))) {
 		mapping->writeback_index = index;
+	}
+	if (!wbc->no_nrwrite_update)
+		wbc->nr_to_write = nr_to_write;
 
-	if (wbc->range_cont)
-		wbc->range_start = index << PAGE_CACHE_SHIFT;
 	return ret;
 }
 EXPORT_SYMBOL(write_cache_pages);