Patchwork [18/29] ext4: Restructure writeback path

login
register
mail settings
Submitter Jan Kara
Date April 8, 2013, 9:32 p.m.
Message ID <1365456754-29373-19-git-send-email-jack@suse.cz>
Download mbox | patch
Permalink /patch/234902/
State Superseded
Headers show

Comments

Jan Kara - April 8, 2013, 9:32 p.m.
There are two issues with current writeback path in ext4. For one we
don't necessarily map complete pages when blocksize < pagesize and thus
needn't do any writeback in one iteration. We always map some blocks
though so we will eventually finish mapping the page. Just if writeback
races with other operations on the file, forward progress is not really
guaranteed. The second problem is that current code structure makes it
hard to associate all the bios to some range of pages with one io_end
structure so that unwritten extents can be converted after all the bios
are finished. This will be especially difficult later when io_end will
be associated with reserved transaction handle.

We restructure the writeback path to a relatively simple loop which
first prepares extent of pages, then maps one or more extents so that
no page is partially mapped, and once page is fully mapped it is
submitted for IO. We keep all the mapping and IO submission information
in mpage_da_data structure to somewhat reduce stack usage. Resulting
code is somewhat shorter than the old one and hopefully also easier to
read.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/ext4.h              |   15 -
 fs/ext4/inode.c             |  978 +++++++++++++++++++++----------------------
 include/trace/events/ext4.h |   64 ++-
 3 files changed, 508 insertions(+), 549 deletions(-)
Zheng Liu - May 8, 2013, 3:48 a.m.
On Mon, Apr 08, 2013 at 11:32:23PM +0200, Jan Kara wrote:
> There are two issues with current writeback path in ext4. For one we
> don't necessarily map complete pages when blocksize < pagesize and thus
> needn't do any writeback in one iteration. We always map some blocks
> though so we will eventually finish mapping the page. Just if writeback
> races with other operations on the file, forward progress is not really
> guaranteed. The second problem is that current code structure makes it
> hard to associate all the bios to some range of pages with one io_end
> structure so that unwritten extents can be converted after all the bios
> are finished. This will be especially difficult later when io_end will
> be associated with reserved transaction handle.
> 
> We restructure the writeback path to a relatively simple loop which
> first prepares extent of pages, then maps one or more extents so that
> no page is partially mapped, and once page is fully mapped it is
> submitted for IO. We keep all the mapping and IO submission information
> in mpage_da_data structure to somewhat reduce stack usage. Resulting
> code is somewhat shorter than the old one and hopefully also easier to
> read.
> 
> Signed-off-by: Jan Kara <jack@suse.cz>

One nit below.  Otherwise the patch looks good to be.
Reviewed-by: Zheng Liu <wenqing.lz@taobao.com>

> ---
>  fs/ext4/ext4.h              |   15 -
>  fs/ext4/inode.c             |  978 +++++++++++++++++++++----------------------
>  include/trace/events/ext4.h |   64 ++-
>  3 files changed, 508 insertions(+), 549 deletions(-)
...
>  /*
> - * write_cache_pages_da - walk the list of dirty pages of the given
> - * address space and accumulate pages that need writing, and call
> - * mpage_da_map_and_submit to map a single contiguous memory region
> - * and then write them.
> + * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
> + * 				 and underlying extent to map
> + *
> + * @mpd - where to look for pages
> + *
> + * Walk dirty pages in the mapping while they are contiguous and lock them.
> + * While pages are fully mapped submit them for IO. When we find a page which
> + * isn't mapped we start accumulating extent of buffers underlying these pages
> + * that needs mapping (formed by either delayed or unwritten buffers). The
> + * extent found is returned in @mpd structure (starting at mpd->lblk with
> + * length mpd->len blocks).
>   */
> -static int write_cache_pages_da(handle_t *handle,
> -				struct address_space *mapping,
> -				struct writeback_control *wbc,
> -				struct mpage_da_data *mpd,
> -				pgoff_t *done_index)
> +static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
>  {
> -	struct buffer_head	*bh, *head;
> -	struct inode		*inode = mapping->host;
> -	struct pagevec		pvec;
> -	unsigned int		nr_pages;
> -	sector_t		logical;
> -	pgoff_t			index, end;
> -	long			nr_to_write = wbc->nr_to_write;
> -	int			i, tag, ret = 0;
> -
> -	memset(mpd, 0, sizeof(struct mpage_da_data));
> -	mpd->wbc = wbc;
> -	mpd->inode = inode;
> -	pagevec_init(&pvec, 0);
> -	index = wbc->range_start >> PAGE_CACHE_SHIFT;
> -	end = wbc->range_end >> PAGE_CACHE_SHIFT;
> +	struct address_space *mapping = mpd->inode->i_mapping;
> +	struct pagevec pvec;
> +	unsigned int nr_pages;
> +	pgoff_t index = mpd->first_page;
> +	pgoff_t end = mpd->last_page;
> +	bool first_page_found = false;
> +	int tag;
> +	int i, err = 0;
> +	int blkbits = mpd->inode->i_blkbits;
> +	ext4_lblk_t lblk;
> +	struct buffer_head *head;
>  
> -	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
> +	if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
>  		tag = PAGECACHE_TAG_TOWRITE;
>  	else
>  		tag = PAGECACHE_TAG_DIRTY;
>  
> -	*done_index = index;
> +	mpd->map.m_len = 0;
> +	mpd->next_page = index;

Forgot to call pagevec_init(&pvec, 0) here.

Regards,
                                                - Zheng

>  	while (index <= end) {
>  		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
>  			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
>  		if (nr_pages == 0)
> -			return 0;
> +			goto out;
>  
>  		for (i = 0; i < nr_pages; i++) {
>  			struct page *page = pvec.pages[i];
> @@ -2145,31 +2138,21 @@ static int write_cache_pages_da(handle_t *handle,
>  			if (page->index > end)
>  				goto out;
>  
> -			*done_index = page->index + 1;
> -
> -			/*
> -			 * If we can't merge this page, and we have
> -			 * accumulated an contiguous region, write it
> -			 */
> -			if ((mpd->next_page != page->index) &&
> -			    (mpd->next_page != mpd->first_page)) {
> -				mpage_da_map_and_submit(mpd);
> -				goto ret_extent_tail;
> -			}
> +			/* If we can't merge this page, we are done. */
> +			if (first_page_found && mpd->next_page != page->index)
> +				goto out;
>  
>  			lock_page(page);
> -
>  			/*
> -			 * If the page is no longer dirty, or its
> -			 * mapping no longer corresponds to inode we
> -			 * are writing (which means it has been
> -			 * truncated or invalidated), or the page is
> -			 * already under writeback and we are not
> -			 * doing a data integrity writeback, skip the page
> +			 * If the page is no longer dirty, or its mapping no
> +			 * longer corresponds to inode we are writing (which
> +			 * means it has been truncated or invalidated), or the
> +			 * page is already under writeback and we are not doing
> +			 * a data integrity writeback, skip the page
>  			 */
>  			if (!PageDirty(page) ||
>  			    (PageWriteback(page) &&
> -			     (wbc->sync_mode == WB_SYNC_NONE)) ||
> +			     (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
>  			    unlikely(page->mapping != mapping)) {
>  				unlock_page(page);
>  				continue;
> @@ -2178,101 +2161,60 @@ static int write_cache_pages_da(handle_t *handle,
>  			wait_on_page_writeback(page);
>  			BUG_ON(PageWriteback(page));
>  
> -			/*
> -			 * If we have inline data and arrive here, it means that
> -			 * we will soon create the block for the 1st page, so
> -			 * we'd better clear the inline data here.
> -			 */
> -			if (ext4_has_inline_data(inode)) {
> -				BUG_ON(ext4_test_inode_state(inode,
> -						EXT4_STATE_MAY_INLINE_DATA));
> -				ext4_destroy_inline_data(handle, inode);
> -			}
> -
> -			if (mpd->next_page != page->index)
> +			if (!first_page_found) {
>  				mpd->first_page = page->index;
> +				first_page_found = true;
> +			}
>  			mpd->next_page = page->index + 1;
> -			logical = (sector_t) page->index <<
> -				(PAGE_CACHE_SHIFT - inode->i_blkbits);
> +			lblk = ((ext4_lblk_t)page->index) <<
> +				(PAGE_CACHE_SHIFT - blkbits);
>  
>  			/* Add all dirty buffers to mpd */
>  			head = page_buffers(page);
> -			bh = head;
> -			do {
> -				BUG_ON(buffer_locked(bh));
> -				/*
> -				 * We need to try to allocate unmapped blocks
> -				 * in the same page.  Otherwise we won't make
> -				 * progress with the page in ext4_writepage
> -				 */
> -				if (ext4_bh_delay_or_unwritten(NULL, bh)) {
> -					mpage_add_bh_to_extent(mpd, logical,
> -							       bh->b_state);
> -					if (mpd->io_done)
> -						goto ret_extent_tail;
> -				} else if (buffer_dirty(bh) &&
> -					   buffer_mapped(bh)) {
> -					/*
> -					 * mapped dirty buffer. We need to
> -					 * update the b_state because we look
> -					 * at b_state in mpage_da_map_blocks.
> -					 * We don't update b_size because if we
> -					 * find an unmapped buffer_head later
> -					 * we need to use the b_state flag of
> -					 * that buffer_head.
> -					 */
> -					if (mpd->b_size == 0)
> -						mpd->b_state =
> -							bh->b_state & BH_FLAGS;
> -				}
> -				logical++;
> -			} while ((bh = bh->b_this_page) != head);
> -
> -			if (nr_to_write > 0) {
> -				nr_to_write--;
> -				if (nr_to_write == 0 &&
> -				    wbc->sync_mode == WB_SYNC_NONE)
> -					/*
> -					 * We stop writing back only if we are
> -					 * not doing integrity sync. In case of
> -					 * integrity sync we have to keep going
> -					 * because someone may be concurrently
> -					 * dirtying pages, and we might have
> -					 * synced a lot of newly appeared dirty
> -					 * pages, but have not synced all of the
> -					 * old dirty pages.
> -					 */
> +			if (!add_page_bufs_to_extent(mpd, head, head, lblk))
> +				goto out;
> +			/* So far everything mapped? Submit the page for IO. */
> +			if (mpd->map.m_len == 0) {
> +				err = mpage_submit_page(mpd, page);
> +				if (err < 0)
>  					goto out;
>  			}
> +
> +			/*
> +			 * Accumulated enough dirty pages? This doesn't apply
> +			 * to WB_SYNC_ALL mode. For integrity sync we have to
> +			 * keep going because someone may be concurrently
> +			 * dirtying pages, and we might have synced a lot of
> +			 * newly appeared dirty pages, but have not synced all
> +			 * of the old dirty pages.
> +			 */
> +			if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
> +			    mpd->next_page - mpd->first_page >=
> +							mpd->wbc->nr_to_write)
> +				goto out;
>  		}
>  		pagevec_release(&pvec);
>  		cond_resched();
>  	}
>  	return 0;
> -ret_extent_tail:
> -	ret = MPAGE_DA_EXTENT_TAIL;
>  out:
>  	pagevec_release(&pvec);
> -	cond_resched();
> -	return ret;
> +	return err;
>  }
>  
> -
>  static int ext4_da_writepages(struct address_space *mapping,
>  			      struct writeback_control *wbc)
>  {
> -	pgoff_t	index;
> +	pgoff_t	writeback_index = 0;
> +	long nr_to_write = wbc->nr_to_write;
>  	int range_whole = 0;
> +	int cycled = 1;
>  	handle_t *handle = NULL;
>  	struct mpage_da_data mpd;
>  	struct inode *inode = mapping->host;
> -	int pages_written = 0;
> -	int range_cyclic, cycled = 1, io_done = 0;
>  	int needed_blocks, ret = 0;
> -	loff_t range_start = wbc->range_start;
>  	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
> -	pgoff_t done_index = 0;
> -	pgoff_t end;
> +	bool done;
>  	struct blk_plug plug;
>  
>  	trace_ext4_da_writepages(inode, wbc);
> @@ -2298,40 +2240,65 @@ static int ext4_da_writepages(struct address_space *mapping,
>  	if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
>  		return -EROFS;
>  
> +	/*
> +	 * If we have inline data and arrive here, it means that
> +	 * we will soon create the block for the 1st page, so
> +	 * we'd better clear the inline data here.
> +	 */
> +	if (ext4_has_inline_data(inode)) {
> +		/* Just inode will be modified... */
> +		handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
> +		if (IS_ERR(handle)) {
> +			ret = PTR_ERR(handle);
> +			goto out_writepages;
> +		}
> +		BUG_ON(ext4_test_inode_state(inode,
> +				EXT4_STATE_MAY_INLINE_DATA));
> +		ext4_destroy_inline_data(handle, inode);
> +		ext4_journal_stop(handle);
> +	}
> +
>  	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
>  		range_whole = 1;
>  
> -	range_cyclic = wbc->range_cyclic;
>  	if (wbc->range_cyclic) {
> -		index = mapping->writeback_index;
> -		if (index)
> +		writeback_index = mapping->writeback_index;
> +		if (writeback_index)
>  			cycled = 0;
> -		wbc->range_start = index << PAGE_CACHE_SHIFT;
> -		wbc->range_end  = LLONG_MAX;
> -		wbc->range_cyclic = 0;
> -		end = -1;
> +		mpd.first_page = writeback_index;
> +		mpd.last_page = -1;
>  	} else {
> -		index = wbc->range_start >> PAGE_CACHE_SHIFT;
> -		end = wbc->range_end >> PAGE_CACHE_SHIFT;
> +		mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT;
> +		mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT;
>  	}
>  
> +	mpd.inode = inode;
> +	mpd.wbc = wbc;
> +	ext4_io_submit_init(&mpd.io_submit, wbc);
>  retry:
>  	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
> -		tag_pages_for_writeback(mapping, index, end);
> -
> +		tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
> +	done = false;
>  	blk_start_plug(&plug);
> -	while (!ret && wbc->nr_to_write > 0) {
> +	while (!done && mpd.first_page <= mpd.last_page) {
> +		/* For each extent of pages we use new io_end */
> +		mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
> +		if (!mpd.io_submit.io_end) {
> +			ret = -ENOMEM;
> +			break;
> +		}
>  
>  		/*
> -		 * we  insert one extent at a time. So we need
> -		 * credit needed for single extent allocation.
> -		 * journalled mode is currently not supported
> -		 * by delalloc
> +		 * We have two constraints: We find one extent to map and we
> +		 * must always write out whole page (makes a difference when
> +		 * blocksize < pagesize) so that we don't block on IO when we
> +		 * try to write out the rest of the page. Journalled mode is
> +		 * not supported by delalloc.
>  		 */
>  		BUG_ON(ext4_should_journal_data(inode));
>  		needed_blocks = ext4_da_writepages_trans_blocks(inode);
>  
> -		/* start a new transaction*/
> +		/* start a new transaction */
>  		handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
>  					    needed_blocks);
>  		if (IS_ERR(handle)) {
> @@ -2339,76 +2306,67 @@ retry:
>  			ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
>  			       "%ld pages, ino %lu; err %d", __func__,
>  				wbc->nr_to_write, inode->i_ino, ret);
> -			blk_finish_plug(&plug);
> -			goto out_writepages;
> +			/* Release allocated io_end */
> +			ext4_put_io_end(mpd.io_submit.io_end);
> +			break;
>  		}
>  
> -		/*
> -		 * Now call write_cache_pages_da() to find the next
> -		 * contiguous region of logical blocks that need
> -		 * blocks to be allocated by ext4 and submit them.
> -		 */
> -		ret = write_cache_pages_da(handle, mapping,
> -					   wbc, &mpd, &done_index);
> -		/*
> -		 * If we have a contiguous extent of pages and we
> -		 * haven't done the I/O yet, map the blocks and submit
> -		 * them for I/O.
> -		 */
> -		if (!mpd.io_done && mpd.next_page != mpd.first_page) {
> -			mpage_da_map_and_submit(&mpd);
> -			ret = MPAGE_DA_EXTENT_TAIL;
> +		trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
> +		ret = mpage_prepare_extent_to_map(&mpd);
> +		if (!ret) {
> +			if (mpd.map.m_len)
> +				ret = mpage_map_and_submit_extent(handle, &mpd);
> +			else {
> +				/*
> +				 * We scanned the whole range (or exhausted
> +				 * nr_to_write), submitted what was mapped and
> +				 * didn't find anything needing mapping. We are
> +				 * done.
> +				 */
> +				done = true;
> +			}
>  		}
> -		trace_ext4_da_write_pages(inode, &mpd);
> -		wbc->nr_to_write -= mpd.pages_written;
> -
>  		ext4_journal_stop(handle);
> -
> -		if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
> -			/* commit the transaction which would
> +		/* Submit prepared bio */
> +		ext4_io_submit(&mpd.io_submit);
> +		/* Unlock pages we didn't use */
> +		mpage_release_unused_pages(&mpd, false);
> +		/* Drop our io_end reference we got from init */
> +		ext4_put_io_end(mpd.io_submit.io_end);
> +
> +		if (ret == -ENOSPC && sbi->s_journal) {
> +			/*
> +			 * Commit the transaction which would
>  			 * free blocks released in the transaction
>  			 * and try again
>  			 */
>  			jbd2_journal_force_commit_nested(sbi->s_journal);
>  			ret = 0;
> -		} else if (ret == MPAGE_DA_EXTENT_TAIL) {
> -			/*
> -			 * Got one extent now try with rest of the pages.
> -			 * If mpd.retval is set -EIO, journal is aborted.
> -			 * So we don't need to write any more.
> -			 */
> -			pages_written += mpd.pages_written;
> -			ret = mpd.retval;
> -			io_done = 1;
> -		} else if (wbc->nr_to_write)
> -			/*
> -			 * There is no more writeout needed
> -			 * or we requested for a noblocking writeout
> -			 * and we found the device congested
> -			 */
> +			continue;
> +		}
> +		/* Fatal error - ENOMEM, EIO... */
> +		if (ret)
>  			break;
>  	}
>  	blk_finish_plug(&plug);
> -	if (!io_done && !cycled) {
> +	if (!ret && !cycled) {
>  		cycled = 1;
> -		index = 0;
> -		wbc->range_start = index << PAGE_CACHE_SHIFT;
> -		wbc->range_end  = mapping->writeback_index - 1;
> +		mpd.last_page = writeback_index - 1;
> +		mpd.first_page = 0;
>  		goto retry;
>  	}
>  
>  	/* Update index */
> -	wbc->range_cyclic = range_cyclic;
>  	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
>  		/*
> -		 * set the writeback_index so that range_cyclic
> +		 * Set the writeback_index so that range_cyclic
>  		 * mode will write it back later
>  		 */
> -		mapping->writeback_index = done_index;
> +		mapping->writeback_index = mpd.first_page;
>  
>  out_writepages:
> -	wbc->range_start = range_start;
> -	trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
> +	trace_ext4_da_writepages_result(inode, wbc, ret,
> +					nr_to_write - wbc->nr_to_write);
>  	return ret;
>  }
>  
> diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
> index a601bb3..203dcd5 100644
> --- a/include/trace/events/ext4.h
> +++ b/include/trace/events/ext4.h
> @@ -332,43 +332,59 @@ TRACE_EVENT(ext4_da_writepages,
>  );
>  
>  TRACE_EVENT(ext4_da_write_pages,
> -	TP_PROTO(struct inode *inode, struct mpage_da_data *mpd),
> +	TP_PROTO(struct inode *inode, pgoff_t first_page,
> +		 struct writeback_control *wbc),
>  
> -	TP_ARGS(inode, mpd),
> +	TP_ARGS(inode, first_page, wbc),
>  
>  	TP_STRUCT__entry(
>  		__field(	dev_t,	dev			)
>  		__field(	ino_t,	ino			)
> -		__field(	__u64,	b_blocknr		)
> -		__field(	__u32,	b_size			)
> -		__field(	__u32,	b_state			)
> -		__field(	unsigned long,	first_page	)
> -		__field(	int,	io_done			)
> -		__field(	int,	pages_written		)
> -		__field(	int,	sync_mode		)
> +		__field(      pgoff_t,	first_page		)
> +		__field(	 long,	nr_to_write		)
> +		__field(	  int,	sync_mode		)
>  	),
>  
>  	TP_fast_assign(
>  		__entry->dev		= inode->i_sb->s_dev;
>  		__entry->ino		= inode->i_ino;
> -		__entry->b_blocknr	= mpd->b_blocknr;
> -		__entry->b_size		= mpd->b_size;
> -		__entry->b_state	= mpd->b_state;
> -		__entry->first_page	= mpd->first_page;
> -		__entry->io_done	= mpd->io_done;
> -		__entry->pages_written	= mpd->pages_written;
> -		__entry->sync_mode	= mpd->wbc->sync_mode;
> +		__entry->first_page	= first_page;
> +		__entry->nr_to_write	= wbc->nr_to_write;
> +		__entry->sync_mode	= wbc->sync_mode;
>  	),
>  
> -	TP_printk("dev %d,%d ino %lu b_blocknr %llu b_size %u b_state 0x%04x "
> -		  "first_page %lu io_done %d pages_written %d sync_mode %d",
> +	TP_printk("dev %d,%d ino %lu first_page %lu nr_to_write %ld "
> +		  "sync_mode %d",
>  		  MAJOR(__entry->dev), MINOR(__entry->dev),
> -		  (unsigned long) __entry->ino,
> -		  __entry->b_blocknr, __entry->b_size,
> -		  __entry->b_state, __entry->first_page,
> -		  __entry->io_done, __entry->pages_written,
> -		  __entry->sync_mode
> -                  )
> +		  (unsigned long) __entry->ino, __entry->first_page,
> +		  __entry->nr_to_write, __entry->sync_mode)
> +);
> +
> +TRACE_EVENT(ext4_da_write_pages_extent,
> +	TP_PROTO(struct inode *inode, struct ext4_map_blocks *map),
> +
> +	TP_ARGS(inode, map),
> +
> +	TP_STRUCT__entry(
> +		__field(	dev_t,	dev			)
> +		__field(	ino_t,	ino			)
> +		__field(	__u64,	lblk			)
> +		__field(	__u32,	len			)
> +		__field(	__u32,	flags			)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->dev		= inode->i_sb->s_dev;
> +		__entry->ino		= inode->i_ino;
> +		__entry->lblk		= map->m_lblk;
> +		__entry->len		= map->m_len;
> +		__entry->flags		= map->m_flags;
> +	),
> +
> +	TP_printk("dev %d,%d ino %lu lblk %llu len %u flags 0x%04x",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  (unsigned long) __entry->ino, __entry->lblk, __entry->len,
> +		  __entry->flags)
>  );
>  
>  TRACE_EVENT(ext4_da_writepages_result,
> -- 
> 1.7.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara - May 8, 2013, 11:20 a.m.
On Wed 08-05-13 11:48:57, Zheng Liu wrote:
> On Mon, Apr 08, 2013 at 11:32:23PM +0200, Jan Kara wrote:
> > There are two issues with current writeback path in ext4. For one we
> > don't necessarily map complete pages when blocksize < pagesize and thus
> > needn't do any writeback in one iteration. We always map some blocks
> > though so we will eventually finish mapping the page. Just if writeback
> > races with other operations on the file, forward progress is not really
> > guaranteed. The second problem is that current code structure makes it
> > hard to associate all the bios to some range of pages with one io_end
> > structure so that unwritten extents can be converted after all the bios
> > are finished. This will be especially difficult later when io_end will
> > be associated with reserved transaction handle.
> > 
> > We restructure the writeback path to a relatively simple loop which
> > first prepares extent of pages, then maps one or more extents so that
> > no page is partially mapped, and once page is fully mapped it is
> > submitted for IO. We keep all the mapping and IO submission information
> > in mpage_da_data structure to somewhat reduce stack usage. Resulting
> > code is somewhat shorter than the old one and hopefully also easier to
> > read.
> > 
> > Signed-off-by: Jan Kara <jack@suse.cz>
> 
> One nit below.  Otherwise the patch looks good to be.
> Reviewed-by: Zheng Liu <wenqing.lz@taobao.com>
  Thanks for review!

> > ---
> >  fs/ext4/ext4.h              |   15 -
> >  fs/ext4/inode.c             |  978 +++++++++++++++++++++----------------------
> >  include/trace/events/ext4.h |   64 ++-
> >  3 files changed, 508 insertions(+), 549 deletions(-)
> ...
> >  /*
> > - * write_cache_pages_da - walk the list of dirty pages of the given
> > - * address space and accumulate pages that need writing, and call
> > - * mpage_da_map_and_submit to map a single contiguous memory region
> > - * and then write them.
> > + * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
> > + * 				 and underlying extent to map
> > + *
> > + * @mpd - where to look for pages
> > + *
> > + * Walk dirty pages in the mapping while they are contiguous and lock them.
> > + * While pages are fully mapped submit them for IO. When we find a page which
> > + * isn't mapped we start accumulating extent of buffers underlying these pages
> > + * that needs mapping (formed by either delayed or unwritten buffers). The
> > + * extent found is returned in @mpd structure (starting at mpd->lblk with
> > + * length mpd->len blocks).
> >   */
> > -static int write_cache_pages_da(handle_t *handle,
> > -				struct address_space *mapping,
> > -				struct writeback_control *wbc,
> > -				struct mpage_da_data *mpd,
> > -				pgoff_t *done_index)
> > +static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
> >  {
> > -	struct buffer_head	*bh, *head;
> > -	struct inode		*inode = mapping->host;
> > -	struct pagevec		pvec;
> > -	unsigned int		nr_pages;
> > -	sector_t		logical;
> > -	pgoff_t			index, end;
> > -	long			nr_to_write = wbc->nr_to_write;
> > -	int			i, tag, ret = 0;
> > -
> > -	memset(mpd, 0, sizeof(struct mpage_da_data));
> > -	mpd->wbc = wbc;
> > -	mpd->inode = inode;
> > -	pagevec_init(&pvec, 0);
> > -	index = wbc->range_start >> PAGE_CACHE_SHIFT;
> > -	end = wbc->range_end >> PAGE_CACHE_SHIFT;
> > +	struct address_space *mapping = mpd->inode->i_mapping;
> > +	struct pagevec pvec;
> > +	unsigned int nr_pages;
> > +	pgoff_t index = mpd->first_page;
> > +	pgoff_t end = mpd->last_page;
> > +	bool first_page_found = false;
> > +	int tag;
> > +	int i, err = 0;
> > +	int blkbits = mpd->inode->i_blkbits;
> > +	ext4_lblk_t lblk;
> > +	struct buffer_head *head;
> >  
> > -	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
> > +	if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
> >  		tag = PAGECACHE_TAG_TOWRITE;
> >  	else
> >  		tag = PAGECACHE_TAG_DIRTY;
> >  
> > -	*done_index = index;
> > +	mpd->map.m_len = 0;
> > +	mpd->next_page = index;
> 
> Forgot to call pagevec_init(&pvec, 0) here.
  I actually don't think it can cause any problems here (pagevec_lookup()
simply overwrites the pvec and we call pagevec_release() only after
pagevec_lookup()) but it's certainly a good practice so I added it.

								Honza

> >  	while (index <= end) {
> >  		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
> >  			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
> >  		if (nr_pages == 0)
> > -			return 0;
> > +			goto out;
> >  
> >  		for (i = 0; i < nr_pages; i++) {
> >  			struct page *page = pvec.pages[i];
> > @@ -2145,31 +2138,21 @@ static int write_cache_pages_da(handle_t *handle,
> >  			if (page->index > end)
> >  				goto out;
> >  
> > -			*done_index = page->index + 1;
> > -
> > -			/*
> > -			 * If we can't merge this page, and we have
> > -			 * accumulated an contiguous region, write it
> > -			 */
> > -			if ((mpd->next_page != page->index) &&
> > -			    (mpd->next_page != mpd->first_page)) {
> > -				mpage_da_map_and_submit(mpd);
> > -				goto ret_extent_tail;
> > -			}
> > +			/* If we can't merge this page, we are done. */
> > +			if (first_page_found && mpd->next_page != page->index)
> > +				goto out;
> >  
> >  			lock_page(page);
> > -
> >  			/*
> > -			 * If the page is no longer dirty, or its
> > -			 * mapping no longer corresponds to inode we
> > -			 * are writing (which means it has been
> > -			 * truncated or invalidated), or the page is
> > -			 * already under writeback and we are not
> > -			 * doing a data integrity writeback, skip the page
> > +			 * If the page is no longer dirty, or its mapping no
> > +			 * longer corresponds to inode we are writing (which
> > +			 * means it has been truncated or invalidated), or the
> > +			 * page is already under writeback and we are not doing
> > +			 * a data integrity writeback, skip the page
> >  			 */
> >  			if (!PageDirty(page) ||
> >  			    (PageWriteback(page) &&
> > -			     (wbc->sync_mode == WB_SYNC_NONE)) ||
> > +			     (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
> >  			    unlikely(page->mapping != mapping)) {
> >  				unlock_page(page);
> >  				continue;
> > @@ -2178,101 +2161,60 @@ static int write_cache_pages_da(handle_t *handle,
> >  			wait_on_page_writeback(page);
> >  			BUG_ON(PageWriteback(page));
> >  
> > -			/*
> > -			 * If we have inline data and arrive here, it means that
> > -			 * we will soon create the block for the 1st page, so
> > -			 * we'd better clear the inline data here.
> > -			 */
> > -			if (ext4_has_inline_data(inode)) {
> > -				BUG_ON(ext4_test_inode_state(inode,
> > -						EXT4_STATE_MAY_INLINE_DATA));
> > -				ext4_destroy_inline_data(handle, inode);
> > -			}
> > -
> > -			if (mpd->next_page != page->index)
> > +			if (!first_page_found) {
> >  				mpd->first_page = page->index;
> > +				first_page_found = true;
> > +			}
> >  			mpd->next_page = page->index + 1;
> > -			logical = (sector_t) page->index <<
> > -				(PAGE_CACHE_SHIFT - inode->i_blkbits);
> > +			lblk = ((ext4_lblk_t)page->index) <<
> > +				(PAGE_CACHE_SHIFT - blkbits);
> >  
> >  			/* Add all dirty buffers to mpd */
> >  			head = page_buffers(page);
> > -			bh = head;
> > -			do {
> > -				BUG_ON(buffer_locked(bh));
> > -				/*
> > -				 * We need to try to allocate unmapped blocks
> > -				 * in the same page.  Otherwise we won't make
> > -				 * progress with the page in ext4_writepage
> > -				 */
> > -				if (ext4_bh_delay_or_unwritten(NULL, bh)) {
> > -					mpage_add_bh_to_extent(mpd, logical,
> > -							       bh->b_state);
> > -					if (mpd->io_done)
> > -						goto ret_extent_tail;
> > -				} else if (buffer_dirty(bh) &&
> > -					   buffer_mapped(bh)) {
> > -					/*
> > -					 * mapped dirty buffer. We need to
> > -					 * update the b_state because we look
> > -					 * at b_state in mpage_da_map_blocks.
> > -					 * We don't update b_size because if we
> > -					 * find an unmapped buffer_head later
> > -					 * we need to use the b_state flag of
> > -					 * that buffer_head.
> > -					 */
> > -					if (mpd->b_size == 0)
> > -						mpd->b_state =
> > -							bh->b_state & BH_FLAGS;
> > -				}
> > -				logical++;
> > -			} while ((bh = bh->b_this_page) != head);
> > -
> > -			if (nr_to_write > 0) {
> > -				nr_to_write--;
> > -				if (nr_to_write == 0 &&
> > -				    wbc->sync_mode == WB_SYNC_NONE)
> > -					/*
> > -					 * We stop writing back only if we are
> > -					 * not doing integrity sync. In case of
> > -					 * integrity sync we have to keep going
> > -					 * because someone may be concurrently
> > -					 * dirtying pages, and we might have
> > -					 * synced a lot of newly appeared dirty
> > -					 * pages, but have not synced all of the
> > -					 * old dirty pages.
> > -					 */
> > +			if (!add_page_bufs_to_extent(mpd, head, head, lblk))
> > +				goto out;
> > +			/* So far everything mapped? Submit the page for IO. */
> > +			if (mpd->map.m_len == 0) {
> > +				err = mpage_submit_page(mpd, page);
> > +				if (err < 0)
> >  					goto out;
> >  			}
> > +
> > +			/*
> > +			 * Accumulated enough dirty pages? This doesn't apply
> > +			 * to WB_SYNC_ALL mode. For integrity sync we have to
> > +			 * keep going because someone may be concurrently
> > +			 * dirtying pages, and we might have synced a lot of
> > +			 * newly appeared dirty pages, but have not synced all
> > +			 * of the old dirty pages.
> > +			 */
> > +			if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
> > +			    mpd->next_page - mpd->first_page >=
> > +							mpd->wbc->nr_to_write)
> > +				goto out;
> >  		}
> >  		pagevec_release(&pvec);
> >  		cond_resched();
> >  	}
> >  	return 0;
> > -ret_extent_tail:
> > -	ret = MPAGE_DA_EXTENT_TAIL;
> >  out:
> >  	pagevec_release(&pvec);
> > -	cond_resched();
> > -	return ret;
> > +	return err;
> >  }
> >  
> > -
> >  static int ext4_da_writepages(struct address_space *mapping,
> >  			      struct writeback_control *wbc)
> >  {
> > -	pgoff_t	index;
> > +	pgoff_t	writeback_index = 0;
> > +	long nr_to_write = wbc->nr_to_write;
> >  	int range_whole = 0;
> > +	int cycled = 1;
> >  	handle_t *handle = NULL;
> >  	struct mpage_da_data mpd;
> >  	struct inode *inode = mapping->host;
> > -	int pages_written = 0;
> > -	int range_cyclic, cycled = 1, io_done = 0;
> >  	int needed_blocks, ret = 0;
> > -	loff_t range_start = wbc->range_start;
> >  	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
> > -	pgoff_t done_index = 0;
> > -	pgoff_t end;
> > +	bool done;
> >  	struct blk_plug plug;
> >  
> >  	trace_ext4_da_writepages(inode, wbc);
> > @@ -2298,40 +2240,65 @@ static int ext4_da_writepages(struct address_space *mapping,
> >  	if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
> >  		return -EROFS;
> >  
> > +	/*
> > +	 * If we have inline data and arrive here, it means that
> > +	 * we will soon create the block for the 1st page, so
> > +	 * we'd better clear the inline data here.
> > +	 */
> > +	if (ext4_has_inline_data(inode)) {
> > +		/* Just inode will be modified... */
> > +		handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
> > +		if (IS_ERR(handle)) {
> > +			ret = PTR_ERR(handle);
> > +			goto out_writepages;
> > +		}
> > +		BUG_ON(ext4_test_inode_state(inode,
> > +				EXT4_STATE_MAY_INLINE_DATA));
> > +		ext4_destroy_inline_data(handle, inode);
> > +		ext4_journal_stop(handle);
> > +	}
> > +
> >  	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
> >  		range_whole = 1;
> >  
> > -	range_cyclic = wbc->range_cyclic;
> >  	if (wbc->range_cyclic) {
> > -		index = mapping->writeback_index;
> > -		if (index)
> > +		writeback_index = mapping->writeback_index;
> > +		if (writeback_index)
> >  			cycled = 0;
> > -		wbc->range_start = index << PAGE_CACHE_SHIFT;
> > -		wbc->range_end  = LLONG_MAX;
> > -		wbc->range_cyclic = 0;
> > -		end = -1;
> > +		mpd.first_page = writeback_index;
> > +		mpd.last_page = -1;
> >  	} else {
> > -		index = wbc->range_start >> PAGE_CACHE_SHIFT;
> > -		end = wbc->range_end >> PAGE_CACHE_SHIFT;
> > +		mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT;
> > +		mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT;
> >  	}
> >  
> > +	mpd.inode = inode;
> > +	mpd.wbc = wbc;
> > +	ext4_io_submit_init(&mpd.io_submit, wbc);
> >  retry:
> >  	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
> > -		tag_pages_for_writeback(mapping, index, end);
> > -
> > +		tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
> > +	done = false;
> >  	blk_start_plug(&plug);
> > -	while (!ret && wbc->nr_to_write > 0) {
> > +	while (!done && mpd.first_page <= mpd.last_page) {
> > +		/* For each extent of pages we use new io_end */
> > +		mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
> > +		if (!mpd.io_submit.io_end) {
> > +			ret = -ENOMEM;
> > +			break;
> > +		}
> >  
> >  		/*
> > -		 * we  insert one extent at a time. So we need
> > -		 * credit needed for single extent allocation.
> > -		 * journalled mode is currently not supported
> > -		 * by delalloc
> > +		 * We have two constraints: We find one extent to map and we
> > +		 * must always write out whole page (makes a difference when
> > +		 * blocksize < pagesize) so that we don't block on IO when we
> > +		 * try to write out the rest of the page. Journalled mode is
> > +		 * not supported by delalloc.
> >  		 */
> >  		BUG_ON(ext4_should_journal_data(inode));
> >  		needed_blocks = ext4_da_writepages_trans_blocks(inode);
> >  
> > -		/* start a new transaction*/
> > +		/* start a new transaction */
> >  		handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
> >  					    needed_blocks);
> >  		if (IS_ERR(handle)) {
> > @@ -2339,76 +2306,67 @@ retry:
> >  			ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
> >  			       "%ld pages, ino %lu; err %d", __func__,
> >  				wbc->nr_to_write, inode->i_ino, ret);
> > -			blk_finish_plug(&plug);
> > -			goto out_writepages;
> > +			/* Release allocated io_end */
> > +			ext4_put_io_end(mpd.io_submit.io_end);
> > +			break;
> >  		}
> >  
> > -		/*
> > -		 * Now call write_cache_pages_da() to find the next
> > -		 * contiguous region of logical blocks that need
> > -		 * blocks to be allocated by ext4 and submit them.
> > -		 */
> > -		ret = write_cache_pages_da(handle, mapping,
> > -					   wbc, &mpd, &done_index);
> > -		/*
> > -		 * If we have a contiguous extent of pages and we
> > -		 * haven't done the I/O yet, map the blocks and submit
> > -		 * them for I/O.
> > -		 */
> > -		if (!mpd.io_done && mpd.next_page != mpd.first_page) {
> > -			mpage_da_map_and_submit(&mpd);
> > -			ret = MPAGE_DA_EXTENT_TAIL;
> > +		trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
> > +		ret = mpage_prepare_extent_to_map(&mpd);
> > +		if (!ret) {
> > +			if (mpd.map.m_len)
> > +				ret = mpage_map_and_submit_extent(handle, &mpd);
> > +			else {
> > +				/*
> > +				 * We scanned the whole range (or exhausted
> > +				 * nr_to_write), submitted what was mapped and
> > +				 * didn't find anything needing mapping. We are
> > +				 * done.
> > +				 */
> > +				done = true;
> > +			}
> >  		}
> > -		trace_ext4_da_write_pages(inode, &mpd);
> > -		wbc->nr_to_write -= mpd.pages_written;
> > -
> >  		ext4_journal_stop(handle);
> > -
> > -		if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
> > -			/* commit the transaction which would
> > +		/* Submit prepared bio */
> > +		ext4_io_submit(&mpd.io_submit);
> > +		/* Unlock pages we didn't use */
> > +		mpage_release_unused_pages(&mpd, false);
> > +		/* Drop our io_end reference we got from init */
> > +		ext4_put_io_end(mpd.io_submit.io_end);
> > +
> > +		if (ret == -ENOSPC && sbi->s_journal) {
> > +			/*
> > +			 * Commit the transaction which would
> >  			 * free blocks released in the transaction
> >  			 * and try again
> >  			 */
> >  			jbd2_journal_force_commit_nested(sbi->s_journal);
> >  			ret = 0;
> > -		} else if (ret == MPAGE_DA_EXTENT_TAIL) {
> > -			/*
> > -			 * Got one extent now try with rest of the pages.
> > -			 * If mpd.retval is set -EIO, journal is aborted.
> > -			 * So we don't need to write any more.
> > -			 */
> > -			pages_written += mpd.pages_written;
> > -			ret = mpd.retval;
> > -			io_done = 1;
> > -		} else if (wbc->nr_to_write)
> > -			/*
> > -			 * There is no more writeout needed
> > -			 * or we requested for a noblocking writeout
> > -			 * and we found the device congested
> > -			 */
> > +			continue;
> > +		}
> > +		/* Fatal error - ENOMEM, EIO... */
> > +		if (ret)
> >  			break;
> >  	}
> >  	blk_finish_plug(&plug);
> > -	if (!io_done && !cycled) {
> > +	if (!ret && !cycled) {
> >  		cycled = 1;
> > -		index = 0;
> > -		wbc->range_start = index << PAGE_CACHE_SHIFT;
> > -		wbc->range_end  = mapping->writeback_index - 1;
> > +		mpd.last_page = writeback_index - 1;
> > +		mpd.first_page = 0;
> >  		goto retry;
> >  	}
> >  
> >  	/* Update index */
> > -	wbc->range_cyclic = range_cyclic;
> >  	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
> >  		/*
> > -		 * set the writeback_index so that range_cyclic
> > +		 * Set the writeback_index so that range_cyclic
> >  		 * mode will write it back later
> >  		 */
> > -		mapping->writeback_index = done_index;
> > +		mapping->writeback_index = mpd.first_page;
> >  
> >  out_writepages:
> > -	wbc->range_start = range_start;
> > -	trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
> > +	trace_ext4_da_writepages_result(inode, wbc, ret,
> > +					nr_to_write - wbc->nr_to_write);
> >  	return ret;
> >  }
> >  
> > diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
> > index a601bb3..203dcd5 100644
> > --- a/include/trace/events/ext4.h
> > +++ b/include/trace/events/ext4.h
> > @@ -332,43 +332,59 @@ TRACE_EVENT(ext4_da_writepages,
> >  );
> >  
> >  TRACE_EVENT(ext4_da_write_pages,
> > -	TP_PROTO(struct inode *inode, struct mpage_da_data *mpd),
> > +	TP_PROTO(struct inode *inode, pgoff_t first_page,
> > +		 struct writeback_control *wbc),
> >  
> > -	TP_ARGS(inode, mpd),
> > +	TP_ARGS(inode, first_page, wbc),
> >  
> >  	TP_STRUCT__entry(
> >  		__field(	dev_t,	dev			)
> >  		__field(	ino_t,	ino			)
> > -		__field(	__u64,	b_blocknr		)
> > -		__field(	__u32,	b_size			)
> > -		__field(	__u32,	b_state			)
> > -		__field(	unsigned long,	first_page	)
> > -		__field(	int,	io_done			)
> > -		__field(	int,	pages_written		)
> > -		__field(	int,	sync_mode		)
> > +		__field(      pgoff_t,	first_page		)
> > +		__field(	 long,	nr_to_write		)
> > +		__field(	  int,	sync_mode		)
> >  	),
> >  
> >  	TP_fast_assign(
> >  		__entry->dev		= inode->i_sb->s_dev;
> >  		__entry->ino		= inode->i_ino;
> > -		__entry->b_blocknr	= mpd->b_blocknr;
> > -		__entry->b_size		= mpd->b_size;
> > -		__entry->b_state	= mpd->b_state;
> > -		__entry->first_page	= mpd->first_page;
> > -		__entry->io_done	= mpd->io_done;
> > -		__entry->pages_written	= mpd->pages_written;
> > -		__entry->sync_mode	= mpd->wbc->sync_mode;
> > +		__entry->first_page	= first_page;
> > +		__entry->nr_to_write	= wbc->nr_to_write;
> > +		__entry->sync_mode	= wbc->sync_mode;
> >  	),
> >  
> > -	TP_printk("dev %d,%d ino %lu b_blocknr %llu b_size %u b_state 0x%04x "
> > -		  "first_page %lu io_done %d pages_written %d sync_mode %d",
> > +	TP_printk("dev %d,%d ino %lu first_page %lu nr_to_write %ld "
> > +		  "sync_mode %d",
> >  		  MAJOR(__entry->dev), MINOR(__entry->dev),
> > -		  (unsigned long) __entry->ino,
> > -		  __entry->b_blocknr, __entry->b_size,
> > -		  __entry->b_state, __entry->first_page,
> > -		  __entry->io_done, __entry->pages_written,
> > -		  __entry->sync_mode
> > -                  )
> > +		  (unsigned long) __entry->ino, __entry->first_page,
> > +		  __entry->nr_to_write, __entry->sync_mode)
> > +);
> > +
> > +TRACE_EVENT(ext4_da_write_pages_extent,
> > +	TP_PROTO(struct inode *inode, struct ext4_map_blocks *map),
> > +
> > +	TP_ARGS(inode, map),
> > +
> > +	TP_STRUCT__entry(
> > +		__field(	dev_t,	dev			)
> > +		__field(	ino_t,	ino			)
> > +		__field(	__u64,	lblk			)
> > +		__field(	__u32,	len			)
> > +		__field(	__u32,	flags			)
> > +	),
> > +
> > +	TP_fast_assign(
> > +		__entry->dev		= inode->i_sb->s_dev;
> > +		__entry->ino		= inode->i_ino;
> > +		__entry->lblk		= map->m_lblk;
> > +		__entry->len		= map->m_len;
> > +		__entry->flags		= map->m_flags;
> > +	),
> > +
> > +	TP_printk("dev %d,%d ino %lu lblk %llu len %u flags 0x%04x",
> > +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> > +		  (unsigned long) __entry->ino, __entry->lblk, __entry->len,
> > +		  __entry->flags)
> >  );
> >  
> >  TRACE_EVENT(ext4_da_writepages_result,
> > -- 
> > 1.7.1
> > 
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a6f7331..cb1ba1c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -175,21 +175,6 @@  struct ext4_map_blocks {
 };
 
 /*
- * For delayed allocation tracking
- */
-struct mpage_da_data {
-	struct inode *inode;
-	sector_t b_blocknr;		/* start block number of extent */
-	size_t b_size;			/* size of extent */
-	unsigned long b_state;		/* state of the extent */
-	unsigned long first_page, next_page;	/* extent of pages */
-	struct writeback_control *wbc;
-	int io_done;
-	int pages_written;
-	int retval;
-};
-
-/*
  * Flags for ext4_io_end->flags
  */
 #define	EXT4_IO_END_UNWRITTEN	0x0001
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9cb4e75..5c191a3 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1324,145 +1324,42 @@  static void ext4_da_page_release_reservation(struct page *page,
  * Delayed allocation stuff
  */
 
-/*
- * mpage_da_submit_io - walks through extent of pages and try to write
- * them with writepage() call back
- *
- * @mpd->inode: inode
- * @mpd->first_page: first page of the extent
- * @mpd->next_page: page after the last page of the extent
- *
- * By the time mpage_da_submit_io() is called we expect all blocks
- * to be allocated. this may be wrong if allocation failed.
- *
- * As pages are already locked by write_cache_pages(), we can't use it
- */
-static int mpage_da_submit_io(struct mpage_da_data *mpd,
-			      struct ext4_map_blocks *map)
-{
-	struct pagevec pvec;
-	unsigned long index, end;
-	int ret = 0, err, nr_pages, i;
-	struct inode *inode = mpd->inode;
-	struct address_space *mapping = inode->i_mapping;
-	loff_t size = i_size_read(inode);
-	unsigned int len, block_start;
-	struct buffer_head *bh, *page_bufs = NULL;
-	sector_t pblock = 0, cur_logical = 0;
-	struct ext4_io_submit io_submit;
-
-	BUG_ON(mpd->next_page <= mpd->first_page);
-	ext4_io_submit_init(&io_submit, mpd->wbc);
-	io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
-	if (!io_submit.io_end)
-		return -ENOMEM;
+struct mpage_da_data {
+	struct inode *inode;
+	struct writeback_control *wbc;
+	pgoff_t first_page;	/* The first page to write */
+	pgoff_t next_page;	/* Current page to examine */
+	pgoff_t last_page;	/* Last page to examine */
 	/*
-	 * We need to start from the first_page to the next_page - 1
-	 * to make sure we also write the mapped dirty buffer_heads.
-	 * If we look at mpd->b_blocknr we would only be looking
-	 * at the currently mapped buffer_heads.
+	 * Extent to map - this can be after first_page because that can be
+	 * fully mapped. We somewhat abuse m_flags to store whether the extent
+	 * is delalloc or unwritten.
 	 */
-	index = mpd->first_page;
-	end = mpd->next_page - 1;
-
-	pagevec_init(&pvec, 0);
-	while (index <= end) {
-		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
-		if (nr_pages == 0)
-			break;
-		for (i = 0; i < nr_pages; i++) {
-			int skip_page = 0;
-			struct page *page = pvec.pages[i];
-
-			index = page->index;
-			if (index > end)
-				break;
-
-			if (index == size >> PAGE_CACHE_SHIFT)
-				len = size & ~PAGE_CACHE_MASK;
-			else
-				len = PAGE_CACHE_SIZE;
-			if (map) {
-				cur_logical = index << (PAGE_CACHE_SHIFT -
-							inode->i_blkbits);
-				pblock = map->m_pblk + (cur_logical -
-							map->m_lblk);
-			}
-			index++;
-
-			BUG_ON(!PageLocked(page));
-			BUG_ON(PageWriteback(page));
-
-			bh = page_bufs = page_buffers(page);
-			block_start = 0;
-			do {
-				if (map && (cur_logical >= map->m_lblk) &&
-				    (cur_logical <= (map->m_lblk +
-						     (map->m_len - 1)))) {
-					if (buffer_delay(bh)) {
-						clear_buffer_delay(bh);
-						bh->b_blocknr = pblock;
-					}
-					if (buffer_unwritten(bh) ||
-					    buffer_mapped(bh))
-						BUG_ON(bh->b_blocknr != pblock);
-					if (map->m_flags & EXT4_MAP_UNINIT)
-						set_buffer_uninit(bh);
-					clear_buffer_unwritten(bh);
-				}
-
-				/*
-				 * skip page if block allocation undone and
-				 * block is dirty
-				 */
-				if (ext4_bh_delay_or_unwritten(NULL, bh))
-					skip_page = 1;
-				bh = bh->b_this_page;
-				block_start += bh->b_size;
-				cur_logical++;
-				pblock++;
-			} while (bh != page_bufs);
-
-			if (skip_page) {
-				unlock_page(page);
-				continue;
-			}
-
-			clear_page_dirty_for_io(page);
-			err = ext4_bio_write_page(&io_submit, page, len,
-						  mpd->wbc);
-			if (!err)
-				mpd->pages_written++;
-			/*
-			 * In error case, we have to continue because
-			 * remaining pages are still locked
-			 */
-			if (ret == 0)
-				ret = err;
-		}
-		pagevec_release(&pvec);
-	}
-	ext4_io_submit(&io_submit);
-	/* Drop io_end reference we got from init */
-	ext4_put_io_end_defer(io_submit.io_end);
-	return ret;
-}
+	struct ext4_map_blocks map;
+	struct ext4_io_submit io_submit;	/* IO submission data */
+};
 
-static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
+static void mpage_release_unused_pages(struct mpage_da_data *mpd,
+				       bool invalidate)
 {
 	int nr_pages, i;
 	pgoff_t index, end;
 	struct pagevec pvec;
 	struct inode *inode = mpd->inode;
 	struct address_space *mapping = inode->i_mapping;
-	ext4_lblk_t start, last;
+
+	/* This is necessary when next_page == 0. */
+	if (mpd->first_page >= mpd->next_page)
+		return;
 
 	index = mpd->first_page;
 	end   = mpd->next_page - 1;
-
-	start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	ext4_es_remove_extent(inode, start, last - start + 1);
+	if (invalidate) {
+		ext4_lblk_t start, last;
+		start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+		last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+		ext4_es_remove_extent(inode, start, last - start + 1);
+	}
 
 	pagevec_init(&pvec, 0);
 	while (index <= end) {
@@ -1475,14 +1372,15 @@  static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
 				break;
 			BUG_ON(!PageLocked(page));
 			BUG_ON(PageWriteback(page));
-			block_invalidatepage(page, 0);
-			ClearPageUptodate(page);
+			if (invalidate) {
+				block_invalidatepage(page, 0);
+				ClearPageUptodate(page);
+			}
 			unlock_page(page);
 		}
 		index = pvec.pages[nr_pages - 1]->index + 1;
 		pagevec_release(&pvec);
 	}
-	return;
 }
 
 static void ext4_print_free_blocks(struct inode *inode)
@@ -1508,206 +1406,6 @@  static void ext4_print_free_blocks(struct inode *inode)
 	return;
 }
 
-/*
- * mpage_da_map_and_submit - go through given space, map them
- *       if necessary, and then submit them for I/O
- *
- * @mpd - bh describing space
- *
- * The function skips space we know is already mapped to disk blocks.
- *
- */
-static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
-{
-	int err, blks, get_blocks_flags;
-	struct ext4_map_blocks map, *mapp = NULL;
-	sector_t next = mpd->b_blocknr;
-	unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
-	loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
-	handle_t *handle = NULL;
-
-	/*
-	 * If the blocks are mapped already, or we couldn't accumulate
-	 * any blocks, then proceed immediately to the submission stage.
-	 */
-	if ((mpd->b_size == 0) ||
-	    ((mpd->b_state  & (1 << BH_Mapped)) &&
-	     !(mpd->b_state & (1 << BH_Delay)) &&
-	     !(mpd->b_state & (1 << BH_Unwritten))))
-		goto submit_io;
-
-	handle = ext4_journal_current_handle();
-	BUG_ON(!handle);
-
-	/*
-	 * Call ext4_map_blocks() to allocate any delayed allocation
-	 * blocks, or to convert an uninitialized extent to be
-	 * initialized (in the case where we have written into
-	 * one or more preallocated blocks).
-	 *
-	 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
-	 * indicate that we are on the delayed allocation path.  This
-	 * affects functions in many different parts of the allocation
-	 * call path.  This flag exists primarily because we don't
-	 * want to change *many* call functions, so ext4_map_blocks()
-	 * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
-	 * inode's allocation semaphore is taken.
-	 *
-	 * If the blocks in questions were delalloc blocks, set
-	 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
-	 * variables are updated after the blocks have been allocated.
-	 */
-	map.m_lblk = next;
-	map.m_len = max_blocks;
-	get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
-	if (ext4_should_dioread_nolock(mpd->inode))
-		get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
-	if (mpd->b_state & (1 << BH_Delay))
-		get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
-
-	blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
-	if (blks < 0) {
-		struct super_block *sb = mpd->inode->i_sb;
-
-		err = blks;
-		/*
-		 * If get block returns EAGAIN or ENOSPC and there
-		 * appears to be free blocks we will just let
-		 * mpage_da_submit_io() unlock all of the pages.
-		 */
-		if (err == -EAGAIN)
-			goto submit_io;
-
-		if (err == -ENOSPC && ext4_count_free_clusters(sb)) {
-			mpd->retval = err;
-			goto submit_io;
-		}
-
-		/*
-		 * get block failure will cause us to loop in
-		 * writepages, because a_ops->writepage won't be able
-		 * to make progress. The page will be redirtied by
-		 * writepage and writepages will again try to write
-		 * the same.
-		 */
-		if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
-			ext4_msg(sb, KERN_CRIT,
-				 "delayed block allocation failed for inode %lu "
-				 "at logical offset %llu with max blocks %zd "
-				 "with error %d", mpd->inode->i_ino,
-				 (unsigned long long) next,
-				 mpd->b_size >> mpd->inode->i_blkbits, err);
-			ext4_msg(sb, KERN_CRIT,
-				"This should not happen!! Data will be lost");
-			if (err == -ENOSPC)
-				ext4_print_free_blocks(mpd->inode);
-		}
-		/* invalidate all the pages */
-		ext4_da_block_invalidatepages(mpd);
-
-		/* Mark this page range as having been completed */
-		mpd->io_done = 1;
-		return;
-	}
-	BUG_ON(blks == 0);
-
-	mapp = &map;
-	if (map.m_flags & EXT4_MAP_NEW) {
-		struct block_device *bdev = mpd->inode->i_sb->s_bdev;
-		int i;
-
-		for (i = 0; i < map.m_len; i++)
-			unmap_underlying_metadata(bdev, map.m_pblk + i);
-	}
-
-	/*
-	 * Update on-disk size along with block allocation.
-	 */
-	disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
-	if (disksize > i_size_read(mpd->inode))
-		disksize = i_size_read(mpd->inode);
-	if (disksize > EXT4_I(mpd->inode)->i_disksize) {
-		ext4_update_i_disksize(mpd->inode, disksize);
-		err = ext4_mark_inode_dirty(handle, mpd->inode);
-		if (err)
-			ext4_error(mpd->inode->i_sb,
-				   "Failed to mark inode %lu dirty",
-				   mpd->inode->i_ino);
-	}
-
-submit_io:
-	mpage_da_submit_io(mpd, mapp);
-	mpd->io_done = 1;
-}
-
-#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
-		(1 << BH_Delay) | (1 << BH_Unwritten))
-
-/*
- * mpage_add_bh_to_extent - try to add one more block to extent of blocks
- *
- * @mpd->lbh - extent of blocks
- * @logical - logical number of the block in the file
- * @b_state - b_state of the buffer head added
- *
- * the function is used to collect contig. blocks in same state
- */
-static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical,
-				   unsigned long b_state)
-{
-	sector_t next;
-	int blkbits = mpd->inode->i_blkbits;
-	int nrblocks = mpd->b_size >> blkbits;
-
-	/*
-	 * XXX Don't go larger than mballoc is willing to allocate
-	 * This is a stopgap solution.  We eventually need to fold
-	 * mpage_da_submit_io() into this function and then call
-	 * ext4_map_blocks() multiple times in a loop
-	 */
-	if (nrblocks >= (8*1024*1024 >> blkbits))
-		goto flush_it;
-
-	/* check if the reserved journal credits might overflow */
-	if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) {
-		if (nrblocks >= EXT4_MAX_TRANS_DATA) {
-			/*
-			 * With non-extent format we are limited by the journal
-			 * credit available.  Total credit needed to insert
-			 * nrblocks contiguous blocks is dependent on the
-			 * nrblocks.  So limit nrblocks.
-			 */
-			goto flush_it;
-		}
-	}
-	/*
-	 * First block in the extent
-	 */
-	if (mpd->b_size == 0) {
-		mpd->b_blocknr = logical;
-		mpd->b_size = 1 << blkbits;
-		mpd->b_state = b_state & BH_FLAGS;
-		return;
-	}
-
-	next = mpd->b_blocknr + nrblocks;
-	/*
-	 * Can we merge the block to our big extent?
-	 */
-	if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
-		mpd->b_size += 1 << blkbits;
-		return;
-	}
-
-flush_it:
-	/*
-	 * We couldn't merge the block to our extent, so we
-	 * need to flush current  extent and start new one
-	 */
-	mpage_da_map_and_submit(mpd);
-	return;
-}
-
 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
 {
 	return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
@@ -2077,6 +1775,301 @@  static int ext4_writepage(struct page *page,
 	return ret;
 }
 
+#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
+
+/*
+ * mballoc gives us at most this number of blocks...
+ * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
+ * The rest of mballoc seems to handle chunks upto full group size.
+ */
+#define MAX_WRITEPAGES_EXTENT_LEN 2048
+
+/*
+ * mpage_add_bh_to_extent - try to add bh to extent of blocks to map
+ *
+ * @mpd - extent of blocks
+ * @lblk - logical number of the block in the file
+ * @b_state - b_state of the buffer head added
+ *
+ * the function is used to collect contig. blocks in same state
+ */
+static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
+				  unsigned long b_state)
+{
+	struct ext4_map_blocks *map = &mpd->map;
+
+	/* Don't go larger than mballoc is willing to allocate */
+	if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
+		return 0;
+
+	/* First block in the extent? */
+	if (map->m_len == 0) {
+		map->m_lblk = lblk;
+		map->m_len = 1;
+		map->m_flags = b_state & BH_FLAGS;
+		return 1;
+	}
+
+	/* Can we merge the block to our big extent? */
+	if (lblk == map->m_lblk + map->m_len &&
+	    (b_state & BH_FLAGS) == map->m_flags) {
+		map->m_len++;
+		return 1;
+	}
+	return 0;
+}
+
+static bool add_page_bufs_to_extent(struct mpage_da_data *mpd,
+				    struct buffer_head *head,
+				    struct buffer_head *bh,
+				    ext4_lblk_t lblk)
+{
+	do {
+		BUG_ON(buffer_locked(bh));
+
+		if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
+		    (!buffer_delay(bh) && !buffer_unwritten(bh))) {
+			/* Found extent to map? */
+			if (mpd->map.m_len)
+				return false;
+			continue;
+		}
+		if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state))
+			return false;
+	} while (lblk++, (bh = bh->b_this_page) != head);
+	return true;
+}
+
+static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
+{
+	int len;
+	loff_t size = i_size_read(mpd->inode);
+	int err;
+
+	BUG_ON(page->index != mpd->first_page);
+	if (page->index == size >> PAGE_CACHE_SHIFT)
+		len = size & ~PAGE_CACHE_MASK;
+	else
+		len = PAGE_CACHE_SIZE;
+	clear_page_dirty_for_io(page);
+	err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
+	if (!err)
+		mpd->wbc->nr_to_write--;
+	mpd->first_page++;
+
+	return err;
+}
+
+/*
+ * mpage_map_buffers - update buffers corresponding to changed extent and
+ *		       submit fully mapped pages for IO
+ *
+ * @mpd - description of extent to map, on return next extent to map
+ *
+ * Scan buffers corresponding to changed extent (we expect corresponding pages
+ * to be already locked) and update buffer state according to new extent state.
+ * We map delalloc buffers to their physical location, clear unwritten bits,
+ * and mark buffers as uninit when we perform writes to uninitialized extents
+ * and do extent conversion after IO is finished. If the last page is not fully
+ * mapped, we update @map to the next extent in the last page that needs
+ * mapping. Otherwise we submit the page for IO.
+ */
+static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
+{
+	struct pagevec pvec;
+	int nr_pages, i;
+	struct inode *inode = mpd->inode;
+	struct buffer_head *head, *bh;
+	int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits;
+	pgoff_t start, end;
+	ext4_lblk_t lblk;
+	sector_t pblock;
+	int err;
+
+	start = mpd->map.m_lblk >> bpp_bits;
+	end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
+	lblk = start << bpp_bits;
+	pblock = mpd->map.m_pblk;
+
+	pagevec_init(&pvec, 0);
+	while (start <= end) {
+		nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start,
+					  PAGEVEC_SIZE);
+		if (nr_pages == 0)
+			break;
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			if (page->index > end)
+				break;
+			/* Upto 'end' pages must be contiguous */
+			BUG_ON(page->index != start);
+			bh = head = page_buffers(page);
+			do {
+				if (lblk < mpd->map.m_lblk)
+					continue;
+				if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
+					/*
+					 * Buffer after end of mapped extent.
+					 * Find next buffer in the page to map.
+					 */
+					mpd->map.m_len = 0;
+					mpd->map.m_flags = 0;
+					add_page_bufs_to_extent(mpd, head, bh,
+								lblk);
+					pagevec_release(&pvec);
+					return 0;
+				}
+				if (buffer_delay(bh)) {
+					clear_buffer_delay(bh);
+					bh->b_blocknr = pblock++;
+				}
+				if (mpd->map.m_flags & EXT4_MAP_UNINIT)
+					set_buffer_uninit(bh);
+				clear_buffer_unwritten(bh);
+			} while (lblk++, (bh = bh->b_this_page) != head);
+
+			/* Page fully mapped - let IO run! */
+			err = mpage_submit_page(mpd, page);
+			if (err < 0) {
+				pagevec_release(&pvec);
+				return err;
+			}
+			start++;
+		}
+		pagevec_release(&pvec);
+	}
+	/* Extent fully mapped and matches with page boundary. We are done. */
+	mpd->map.m_len = 0;
+	mpd->map.m_flags = 0;
+	return 0;
+}
+
+static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
+{
+	struct inode *inode = mpd->inode;
+	struct ext4_map_blocks *map = &mpd->map;
+	int get_blocks_flags;
+	int err;
+
+	trace_ext4_da_write_pages_extent(inode, map);
+	/*
+	 * Call ext4_map_blocks() to allocate any delayed allocation
+	 * blocks, or to convert an uninitialized extent to be
+	 * initialized (in the case where we have written into
+	 * one or more preallocated blocks).
+	 *
+	 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks
+	 * in question are delalloc blocks.  This affects functions in many
+	 * different parts of the allocation call path.  This flag exists
+	 * primarily because we don't want to change *many* call functions, so
+	 * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag
+	 * once the inode's allocation semaphore is taken.
+	 */
+	get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
+	if (ext4_should_dioread_nolock(inode))
+		get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
+	if (map->m_flags & (1 << BH_Delay))
+		get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
+
+	err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
+	if (err < 0)
+		return err;
+
+	BUG_ON(map->m_len == 0);
+	if (map->m_flags & EXT4_MAP_NEW) {
+		struct block_device *bdev = inode->i_sb->s_bdev;
+		int i;
+
+		for (i = 0; i < map->m_len; i++)
+			unmap_underlying_metadata(bdev, map->m_pblk + i);
+	}
+	return 0;
+}
+
+/*
+ * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
+ *				 mpd->len and submit pages underlying it for IO
+ *
+ * @handle - handle for journal operations
+ * @mpd - extent to map
+ *
+ * The function maps extent starting at mpd->lblk of length mpd->len. If it is
+ * delayed, blocks are allocated, if it is unwritten, we may need to convert
+ * them to initialized or split the described range from larger unwritten
+ * extent. Note that we need not map all the described range since allocation
+ * can return less blocks or the range is covered by more unwritten extents. We
+ * cannot map more because we are limited by reserved transaction credits. On
+ * the other hand we always make sure that the last touched page is fully
+ * mapped so that it can be written out (and thus forward progress is
+ * guaranteed). After mapping we submit all mapped pages for IO.
+ */
+static int mpage_map_and_submit_extent(handle_t *handle,
+				       struct mpage_da_data *mpd)
+{
+	struct inode *inode = mpd->inode;
+	struct ext4_map_blocks *map = &mpd->map;
+	int err;
+	loff_t disksize;
+
+	while (map->m_len) {
+		err = mpage_map_one_extent(handle, mpd);
+		if (err < 0) {
+			struct super_block *sb = inode->i_sb;
+
+			/*
+			 * Need to commit transaction to free blocks. Let upper
+			 * layers sort it out.
+			 */
+			if (err == -ENOSPC && ext4_count_free_clusters(sb))
+				return -ENOSPC;
+
+			if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
+				ext4_msg(sb, KERN_CRIT,
+					 "Delayed block allocation failed for "
+					 "inode %lu at logical offset %llu with"
+					 " max blocks %u with error %d",
+					 inode->i_ino,
+					 (unsigned long long)map->m_lblk,
+					 (unsigned)map->m_len, err);
+				ext4_msg(sb, KERN_CRIT,
+					 "This should not happen!! Data will "
+					 "be lost\n");
+				if (err == -ENOSPC)
+					ext4_print_free_blocks(inode);
+			}
+			/* invalidate all the pages */
+			mpage_release_unused_pages(mpd, true);
+			return err;
+		}
+		/*
+		 * Update buffer state, submit mapped pages, and get us new
+		 * extent to map
+		 */
+		err = mpage_map_and_submit_buffers(mpd);
+		if (err < 0)
+			return err;
+	}
+
+	/* Update on-disk size after IO is submitted */
+	disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
+	if (disksize > i_size_read(inode))
+		disksize = i_size_read(inode);
+	if (disksize > EXT4_I(inode)->i_disksize) {
+		int err2;
+
+		ext4_update_i_disksize(inode, disksize);
+		err2 = ext4_mark_inode_dirty(handle, inode);
+		if (err2)
+			ext4_error(inode->i_sb,
+				   "Failed to mark inode %lu dirty",
+				   inode->i_ino);
+		if (!err)
+			err = err2;
+	}
+	return err;
+}
+
 /*
  * Calculate the total number of credits to reserve for one writepages
  * iteration. This is called from ext4_da_writepages(). We map an extent of
@@ -2093,44 +2086,44 @@  static int ext4_da_writepages_trans_blocks(struct inode *inode)
 }
 
 /*
- * write_cache_pages_da - walk the list of dirty pages of the given
- * address space and accumulate pages that need writing, and call
- * mpage_da_map_and_submit to map a single contiguous memory region
- * and then write them.
+ * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
+ * 				 and underlying extent to map
+ *
+ * @mpd - where to look for pages
+ *
+ * Walk dirty pages in the mapping while they are contiguous and lock them.
+ * While pages are fully mapped submit them for IO. When we find a page which
+ * isn't mapped we start accumulating extent of buffers underlying these pages
+ * that needs mapping (formed by either delayed or unwritten buffers). The
+ * extent found is returned in @mpd structure (starting at mpd->lblk with
+ * length mpd->len blocks).
  */
-static int write_cache_pages_da(handle_t *handle,
-				struct address_space *mapping,
-				struct writeback_control *wbc,
-				struct mpage_da_data *mpd,
-				pgoff_t *done_index)
+static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
 {
-	struct buffer_head	*bh, *head;
-	struct inode		*inode = mapping->host;
-	struct pagevec		pvec;
-	unsigned int		nr_pages;
-	sector_t		logical;
-	pgoff_t			index, end;
-	long			nr_to_write = wbc->nr_to_write;
-	int			i, tag, ret = 0;
-
-	memset(mpd, 0, sizeof(struct mpage_da_data));
-	mpd->wbc = wbc;
-	mpd->inode = inode;
-	pagevec_init(&pvec, 0);
-	index = wbc->range_start >> PAGE_CACHE_SHIFT;
-	end = wbc->range_end >> PAGE_CACHE_SHIFT;
+	struct address_space *mapping = mpd->inode->i_mapping;
+	struct pagevec pvec;
+	unsigned int nr_pages;
+	pgoff_t index = mpd->first_page;
+	pgoff_t end = mpd->last_page;
+	bool first_page_found = false;
+	int tag;
+	int i, err = 0;
+	int blkbits = mpd->inode->i_blkbits;
+	ext4_lblk_t lblk;
+	struct buffer_head *head;
 
-	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+	if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
 		tag = PAGECACHE_TAG_TOWRITE;
 	else
 		tag = PAGECACHE_TAG_DIRTY;
 
-	*done_index = index;
+	mpd->map.m_len = 0;
+	mpd->next_page = index;
 	while (index <= end) {
 		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
 			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
 		if (nr_pages == 0)
-			return 0;
+			goto out;
 
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
@@ -2145,31 +2138,21 @@  static int write_cache_pages_da(handle_t *handle,
 			if (page->index > end)
 				goto out;
 
-			*done_index = page->index + 1;
-
-			/*
-			 * If we can't merge this page, and we have
-			 * accumulated an contiguous region, write it
-			 */
-			if ((mpd->next_page != page->index) &&
-			    (mpd->next_page != mpd->first_page)) {
-				mpage_da_map_and_submit(mpd);
-				goto ret_extent_tail;
-			}
+			/* If we can't merge this page, we are done. */
+			if (first_page_found && mpd->next_page != page->index)
+				goto out;
 
 			lock_page(page);
-
 			/*
-			 * If the page is no longer dirty, or its
-			 * mapping no longer corresponds to inode we
-			 * are writing (which means it has been
-			 * truncated or invalidated), or the page is
-			 * already under writeback and we are not
-			 * doing a data integrity writeback, skip the page
+			 * If the page is no longer dirty, or its mapping no
+			 * longer corresponds to inode we are writing (which
+			 * means it has been truncated or invalidated), or the
+			 * page is already under writeback and we are not doing
+			 * a data integrity writeback, skip the page
 			 */
 			if (!PageDirty(page) ||
 			    (PageWriteback(page) &&
-			     (wbc->sync_mode == WB_SYNC_NONE)) ||
+			     (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
 			    unlikely(page->mapping != mapping)) {
 				unlock_page(page);
 				continue;
@@ -2178,101 +2161,60 @@  static int write_cache_pages_da(handle_t *handle,
 			wait_on_page_writeback(page);
 			BUG_ON(PageWriteback(page));
 
-			/*
-			 * If we have inline data and arrive here, it means that
-			 * we will soon create the block for the 1st page, so
-			 * we'd better clear the inline data here.
-			 */
-			if (ext4_has_inline_data(inode)) {
-				BUG_ON(ext4_test_inode_state(inode,
-						EXT4_STATE_MAY_INLINE_DATA));
-				ext4_destroy_inline_data(handle, inode);
-			}
-
-			if (mpd->next_page != page->index)
+			if (!first_page_found) {
 				mpd->first_page = page->index;
+				first_page_found = true;
+			}
 			mpd->next_page = page->index + 1;
-			logical = (sector_t) page->index <<
-				(PAGE_CACHE_SHIFT - inode->i_blkbits);
+			lblk = ((ext4_lblk_t)page->index) <<
+				(PAGE_CACHE_SHIFT - blkbits);
 
 			/* Add all dirty buffers to mpd */
 			head = page_buffers(page);
-			bh = head;
-			do {
-				BUG_ON(buffer_locked(bh));
-				/*
-				 * We need to try to allocate unmapped blocks
-				 * in the same page.  Otherwise we won't make
-				 * progress with the page in ext4_writepage
-				 */
-				if (ext4_bh_delay_or_unwritten(NULL, bh)) {
-					mpage_add_bh_to_extent(mpd, logical,
-							       bh->b_state);
-					if (mpd->io_done)
-						goto ret_extent_tail;
-				} else if (buffer_dirty(bh) &&
-					   buffer_mapped(bh)) {
-					/*
-					 * mapped dirty buffer. We need to
-					 * update the b_state because we look
-					 * at b_state in mpage_da_map_blocks.
-					 * We don't update b_size because if we
-					 * find an unmapped buffer_head later
-					 * we need to use the b_state flag of
-					 * that buffer_head.
-					 */
-					if (mpd->b_size == 0)
-						mpd->b_state =
-							bh->b_state & BH_FLAGS;
-				}
-				logical++;
-			} while ((bh = bh->b_this_page) != head);
-
-			if (nr_to_write > 0) {
-				nr_to_write--;
-				if (nr_to_write == 0 &&
-				    wbc->sync_mode == WB_SYNC_NONE)
-					/*
-					 * We stop writing back only if we are
-					 * not doing integrity sync. In case of
-					 * integrity sync we have to keep going
-					 * because someone may be concurrently
-					 * dirtying pages, and we might have
-					 * synced a lot of newly appeared dirty
-					 * pages, but have not synced all of the
-					 * old dirty pages.
-					 */
+			if (!add_page_bufs_to_extent(mpd, head, head, lblk))
+				goto out;
+			/* So far everything mapped? Submit the page for IO. */
+			if (mpd->map.m_len == 0) {
+				err = mpage_submit_page(mpd, page);
+				if (err < 0)
 					goto out;
 			}
+
+			/*
+			 * Accumulated enough dirty pages? This doesn't apply
+			 * to WB_SYNC_ALL mode. For integrity sync we have to
+			 * keep going because someone may be concurrently
+			 * dirtying pages, and we might have synced a lot of
+			 * newly appeared dirty pages, but have not synced all
+			 * of the old dirty pages.
+			 */
+			if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
+			    mpd->next_page - mpd->first_page >=
+							mpd->wbc->nr_to_write)
+				goto out;
 		}
 		pagevec_release(&pvec);
 		cond_resched();
 	}
 	return 0;
-ret_extent_tail:
-	ret = MPAGE_DA_EXTENT_TAIL;
 out:
 	pagevec_release(&pvec);
-	cond_resched();
-	return ret;
+	return err;
 }
 
-
 static int ext4_da_writepages(struct address_space *mapping,
 			      struct writeback_control *wbc)
 {
-	pgoff_t	index;
+	pgoff_t	writeback_index = 0;
+	long nr_to_write = wbc->nr_to_write;
 	int range_whole = 0;
+	int cycled = 1;
 	handle_t *handle = NULL;
 	struct mpage_da_data mpd;
 	struct inode *inode = mapping->host;
-	int pages_written = 0;
-	int range_cyclic, cycled = 1, io_done = 0;
 	int needed_blocks, ret = 0;
-	loff_t range_start = wbc->range_start;
 	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
-	pgoff_t done_index = 0;
-	pgoff_t end;
+	bool done;
 	struct blk_plug plug;
 
 	trace_ext4_da_writepages(inode, wbc);
@@ -2298,40 +2240,65 @@  static int ext4_da_writepages(struct address_space *mapping,
 	if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
 		return -EROFS;
 
+	/*
+	 * If we have inline data and arrive here, it means that
+	 * we will soon create the block for the 1st page, so
+	 * we'd better clear the inline data here.
+	 */
+	if (ext4_has_inline_data(inode)) {
+		/* Just inode will be modified... */
+		handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out_writepages;
+		}
+		BUG_ON(ext4_test_inode_state(inode,
+				EXT4_STATE_MAY_INLINE_DATA));
+		ext4_destroy_inline_data(handle, inode);
+		ext4_journal_stop(handle);
+	}
+
 	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
 		range_whole = 1;
 
-	range_cyclic = wbc->range_cyclic;
 	if (wbc->range_cyclic) {
-		index = mapping->writeback_index;
-		if (index)
+		writeback_index = mapping->writeback_index;
+		if (writeback_index)
 			cycled = 0;
-		wbc->range_start = index << PAGE_CACHE_SHIFT;
-		wbc->range_end  = LLONG_MAX;
-		wbc->range_cyclic = 0;
-		end = -1;
+		mpd.first_page = writeback_index;
+		mpd.last_page = -1;
 	} else {
-		index = wbc->range_start >> PAGE_CACHE_SHIFT;
-		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT;
+		mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT;
 	}
 
+	mpd.inode = inode;
+	mpd.wbc = wbc;
+	ext4_io_submit_init(&mpd.io_submit, wbc);
 retry:
 	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
-		tag_pages_for_writeback(mapping, index, end);
-
+		tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
+	done = false;
 	blk_start_plug(&plug);
-	while (!ret && wbc->nr_to_write > 0) {
+	while (!done && mpd.first_page <= mpd.last_page) {
+		/* For each extent of pages we use new io_end */
+		mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
+		if (!mpd.io_submit.io_end) {
+			ret = -ENOMEM;
+			break;
+		}
 
 		/*
-		 * we  insert one extent at a time. So we need
-		 * credit needed for single extent allocation.
-		 * journalled mode is currently not supported
-		 * by delalloc
+		 * We have two constraints: We find one extent to map and we
+		 * must always write out whole page (makes a difference when
+		 * blocksize < pagesize) so that we don't block on IO when we
+		 * try to write out the rest of the page. Journalled mode is
+		 * not supported by delalloc.
 		 */
 		BUG_ON(ext4_should_journal_data(inode));
 		needed_blocks = ext4_da_writepages_trans_blocks(inode);
 
-		/* start a new transaction*/
+		/* start a new transaction */
 		handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
 					    needed_blocks);
 		if (IS_ERR(handle)) {
@@ -2339,76 +2306,67 @@  retry:
 			ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
 			       "%ld pages, ino %lu; err %d", __func__,
 				wbc->nr_to_write, inode->i_ino, ret);
-			blk_finish_plug(&plug);
-			goto out_writepages;
+			/* Release allocated io_end */
+			ext4_put_io_end(mpd.io_submit.io_end);
+			break;
 		}
 
-		/*
-		 * Now call write_cache_pages_da() to find the next
-		 * contiguous region of logical blocks that need
-		 * blocks to be allocated by ext4 and submit them.
-		 */
-		ret = write_cache_pages_da(handle, mapping,
-					   wbc, &mpd, &done_index);
-		/*
-		 * If we have a contiguous extent of pages and we
-		 * haven't done the I/O yet, map the blocks and submit
-		 * them for I/O.
-		 */
-		if (!mpd.io_done && mpd.next_page != mpd.first_page) {
-			mpage_da_map_and_submit(&mpd);
-			ret = MPAGE_DA_EXTENT_TAIL;
+		trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
+		ret = mpage_prepare_extent_to_map(&mpd);
+		if (!ret) {
+			if (mpd.map.m_len)
+				ret = mpage_map_and_submit_extent(handle, &mpd);
+			else {
+				/*
+				 * We scanned the whole range (or exhausted
+				 * nr_to_write), submitted what was mapped and
+				 * didn't find anything needing mapping. We are
+				 * done.
+				 */
+				done = true;
+			}
 		}
-		trace_ext4_da_write_pages(inode, &mpd);
-		wbc->nr_to_write -= mpd.pages_written;
-
 		ext4_journal_stop(handle);
-
-		if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
-			/* commit the transaction which would
+		/* Submit prepared bio */
+		ext4_io_submit(&mpd.io_submit);
+		/* Unlock pages we didn't use */
+		mpage_release_unused_pages(&mpd, false);
+		/* Drop our io_end reference we got from init */
+		ext4_put_io_end(mpd.io_submit.io_end);
+
+		if (ret == -ENOSPC && sbi->s_journal) {
+			/*
+			 * Commit the transaction which would
 			 * free blocks released in the transaction
 			 * and try again
 			 */
 			jbd2_journal_force_commit_nested(sbi->s_journal);
 			ret = 0;
-		} else if (ret == MPAGE_DA_EXTENT_TAIL) {
-			/*
-			 * Got one extent now try with rest of the pages.
-			 * If mpd.retval is set -EIO, journal is aborted.
-			 * So we don't need to write any more.
-			 */
-			pages_written += mpd.pages_written;
-			ret = mpd.retval;
-			io_done = 1;
-		} else if (wbc->nr_to_write)
-			/*
-			 * There is no more writeout needed
-			 * or we requested for a noblocking writeout
-			 * and we found the device congested
-			 */
+			continue;
+		}
+		/* Fatal error - ENOMEM, EIO... */
+		if (ret)
 			break;
 	}
 	blk_finish_plug(&plug);
-	if (!io_done && !cycled) {
+	if (!ret && !cycled) {
 		cycled = 1;
-		index = 0;
-		wbc->range_start = index << PAGE_CACHE_SHIFT;
-		wbc->range_end  = mapping->writeback_index - 1;
+		mpd.last_page = writeback_index - 1;
+		mpd.first_page = 0;
 		goto retry;
 	}
 
 	/* Update index */
-	wbc->range_cyclic = range_cyclic;
 	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
 		/*
-		 * set the writeback_index so that range_cyclic
+		 * Set the writeback_index so that range_cyclic
 		 * mode will write it back later
 		 */
-		mapping->writeback_index = done_index;
+		mapping->writeback_index = mpd.first_page;
 
 out_writepages:
-	wbc->range_start = range_start;
-	trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
+	trace_ext4_da_writepages_result(inode, wbc, ret,
+					nr_to_write - wbc->nr_to_write);
 	return ret;
 }
 
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index a601bb3..203dcd5 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -332,43 +332,59 @@  TRACE_EVENT(ext4_da_writepages,
 );
 
 TRACE_EVENT(ext4_da_write_pages,
-	TP_PROTO(struct inode *inode, struct mpage_da_data *mpd),
+	TP_PROTO(struct inode *inode, pgoff_t first_page,
+		 struct writeback_control *wbc),
 
-	TP_ARGS(inode, mpd),
+	TP_ARGS(inode, first_page, wbc),
 
 	TP_STRUCT__entry(
 		__field(	dev_t,	dev			)
 		__field(	ino_t,	ino			)
-		__field(	__u64,	b_blocknr		)
-		__field(	__u32,	b_size			)
-		__field(	__u32,	b_state			)
-		__field(	unsigned long,	first_page	)
-		__field(	int,	io_done			)
-		__field(	int,	pages_written		)
-		__field(	int,	sync_mode		)
+		__field(      pgoff_t,	first_page		)
+		__field(	 long,	nr_to_write		)
+		__field(	  int,	sync_mode		)
 	),
 
 	TP_fast_assign(
 		__entry->dev		= inode->i_sb->s_dev;
 		__entry->ino		= inode->i_ino;
-		__entry->b_blocknr	= mpd->b_blocknr;
-		__entry->b_size		= mpd->b_size;
-		__entry->b_state	= mpd->b_state;
-		__entry->first_page	= mpd->first_page;
-		__entry->io_done	= mpd->io_done;
-		__entry->pages_written	= mpd->pages_written;
-		__entry->sync_mode	= mpd->wbc->sync_mode;
+		__entry->first_page	= first_page;
+		__entry->nr_to_write	= wbc->nr_to_write;
+		__entry->sync_mode	= wbc->sync_mode;
 	),
 
-	TP_printk("dev %d,%d ino %lu b_blocknr %llu b_size %u b_state 0x%04x "
-		  "first_page %lu io_done %d pages_written %d sync_mode %d",
+	TP_printk("dev %d,%d ino %lu first_page %lu nr_to_write %ld "
+		  "sync_mode %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->ino,
-		  __entry->b_blocknr, __entry->b_size,
-		  __entry->b_state, __entry->first_page,
-		  __entry->io_done, __entry->pages_written,
-		  __entry->sync_mode
-                  )
+		  (unsigned long) __entry->ino, __entry->first_page,
+		  __entry->nr_to_write, __entry->sync_mode)
+);
+
+TRACE_EVENT(ext4_da_write_pages_extent,
+	TP_PROTO(struct inode *inode, struct ext4_map_blocks *map),
+
+	TP_ARGS(inode, map),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev			)
+		__field(	ino_t,	ino			)
+		__field(	__u64,	lblk			)
+		__field(	__u32,	len			)
+		__field(	__u32,	flags			)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= inode->i_sb->s_dev;
+		__entry->ino		= inode->i_ino;
+		__entry->lblk		= map->m_lblk;
+		__entry->len		= map->m_len;
+		__entry->flags		= map->m_flags;
+	),
+
+	TP_printk("dev %d,%d ino %lu lblk %llu len %u flags 0x%04x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long) __entry->ino, __entry->lblk, __entry->len,
+		  __entry->flags)
 );
 
 TRACE_EVENT(ext4_da_writepages_result,