Patchwork [v3] ext4: Rewrite ext4_page_mkwrite() to use generic helpers

login
register
mail settings
Submitter Jan Kara
Date June 22, 2011, 6:24 p.m.
Message ID <1308767062-27695-1-git-send-email-jack@suse.cz>
Download mbox | patch
Permalink /patch/101533/
State New
Headers show

Comments

Jan Kara - June 22, 2011, 6:24 p.m.
Rewrite ext4_page_mkwrite() to use __block_page_mkwrite() helper. This
removes the need of using i_alloc_sem to avoid races with truncate which
seems to be the wrong locking order according to lock ordering documented in
mm/rmap.c. Also calling ext4_da_write_begin() as used by the old code seems to
be problematic because we can decide to flush delay-allocated blocks which
will acquire s_umount semaphore - again creating unpleasant lock dependency
if not directly a deadlock.

Also add a check for frozen filesystem so that we don't busyloop in page fault
when the filesystem is frozen.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/inode.c |  106 ++++++++++++++++++++++++++++--------------------------
 1 files changed, 55 insertions(+), 51 deletions(-)
Eric Sandeen - June 22, 2011, 7:30 p.m.
On 6/22/11 1:24 PM, Jan Kara wrote:
> Rewrite ext4_page_mkwrite() to use __block_page_mkwrite() helper. This
> removes the need of using i_alloc_sem to avoid races with truncate which
> seems to be the wrong locking order according to lock ordering documented in
> mm/rmap.c. Also calling ext4_da_write_begin() as used by the old code seems to
> be problematic because we can decide to flush delay-allocated blocks which
> will acquire s_umount semaphore - again creating unpleasant lock dependency
> if not directly a deadlock.

I have a customer testcase which reliably locks up with freeze & mmap, and
with this patch in place (and the other 2 higher-level patches that made
it into 3.0-rcX) it passes, FWIW.

-Eric

> Also add a check for frozen filesystem so that we don't busyloop in page fault
> when the filesystem is frozen.
> 
> Signed-off-by: Jan Kara <jack@suse.cz>
> ---
>  fs/ext4/inode.c |  106 ++++++++++++++++++++++++++++--------------------------
>  1 files changed, 55 insertions(+), 51 deletions(-)
> 
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index e3126c0..bd30976 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -5843,80 +5843,84 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
>  	struct page *page = vmf->page;
>  	loff_t size;
>  	unsigned long len;
> -	int ret = -EINVAL;
> -	void *fsdata;
> +	int ret;
>  	struct file *file = vma->vm_file;
>  	struct inode *inode = file->f_path.dentry->d_inode;
>  	struct address_space *mapping = inode->i_mapping;
> +	handle_t *handle;
> +	get_block_t *get_block;
> +	int retries = 0;
>  
>  	/*
> -	 * Get i_alloc_sem to stop truncates messing with the inode. We cannot
> -	 * get i_mutex because we are already holding mmap_sem.
> +	 * This check is racy but catches the common case. We rely on
> +	 * __block_page_mkwrite() to do a reliable check.
>  	 */
> -	down_read(&inode->i_alloc_sem);
> -	size = i_size_read(inode);
> -	if (page->mapping != mapping || size <= page_offset(page)
> -	    || !PageUptodate(page)) {
> -		/* page got truncated from under us? */
> -		goto out_unlock;
> +	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
> +	/* Delalloc case is easy... */
> +	if (test_opt(inode->i_sb, DELALLOC) &&
> +	    !ext4_should_journal_data(inode) &&
> +	    !ext4_nonda_switch(inode->i_sb)) {
> +		do {
> +			ret = __block_page_mkwrite(vma, vmf,
> +						   ext4_da_get_block_prep);
> +		} while (ret == -ENOSPC &&
> +		       ext4_should_retry_alloc(inode->i_sb, &retries));
> +		goto out_ret;
>  	}
> -	ret = 0;
>  
>  	lock_page(page);
> -	wait_on_page_writeback(page);
> -	if (PageMappedToDisk(page)) {
> -		up_read(&inode->i_alloc_sem);
> -		return VM_FAULT_LOCKED;
> +	size = i_size_read(inode);
> +	/* Page got truncated from under us? */
> +	if (page->mapping != mapping || page_offset(page) > size) {
> +		unlock_page(page);
> +		ret = VM_FAULT_NOPAGE;
> +		goto out;
>  	}
>  
>  	if (page->index == size >> PAGE_CACHE_SHIFT)
>  		len = size & ~PAGE_CACHE_MASK;
>  	else
>  		len = PAGE_CACHE_SIZE;
> -
>  	/*
> -	 * return if we have all the buffers mapped. This avoid
> -	 * the need to call write_begin/write_end which does a
> -	 * journal_start/journal_stop which can block and take
> -	 * long time
> +	 * Return if we have all the buffers mapped. This avoids the need to do
> +	 * journal_start/journal_stop which can block and take a long time
>  	 */
>  	if (page_has_buffers(page)) {
>  		if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
>  					ext4_bh_unmapped)) {
> -			up_read(&inode->i_alloc_sem);
> -			return VM_FAULT_LOCKED;
> +			/* Wait so that we don't change page under IO */
> +			wait_on_page_writeback(page);
> +			ret = VM_FAULT_LOCKED;
> +			goto out;
>  		}
>  	}
>  	unlock_page(page);
> -	/*
> -	 * OK, we need to fill the hole... Do write_begin write_end
> -	 * to do block allocation/reservation.We are not holding
> -	 * inode.i__mutex here. That allow * parallel write_begin,
> -	 * write_end call. lock_page prevent this from happening
> -	 * on the same page though
> -	 */
> -	ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
> -			len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
> -	if (ret < 0)
> -		goto out_unlock;
> -	ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
> -			len, len, page, fsdata);
> -	if (ret < 0)
> -		goto out_unlock;
> -	ret = 0;
> -
> -	/*
> -	 * write_begin/end might have created a dirty page and someone
> -	 * could wander in and start the IO.  Make sure that hasn't
> -	 * happened.
> -	 */
> -	lock_page(page);
> -	wait_on_page_writeback(page);
> -	up_read(&inode->i_alloc_sem);
> -	return VM_FAULT_LOCKED;
> -out_unlock:
> -	if (ret)
> +	/* OK, we need to fill the hole... */
> +	if (ext4_should_dioread_nolock(inode))
> +		get_block = ext4_get_block_write;
> +	else
> +		get_block = ext4_get_block;
> +retry_alloc:
> +	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
> +	if (IS_ERR(handle)) {
>  		ret = VM_FAULT_SIGBUS;
> -	up_read(&inode->i_alloc_sem);
> +		goto out;
> +	}
> +	ret = __block_page_mkwrite(vma, vmf, get_block);
> +	if (!ret && ext4_should_journal_data(inode)) {
> +		if (walk_page_buffers(handle, page_buffers(page), 0,
> +			  PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
> +			unlock_page(page);
> +			ret = VM_FAULT_SIGBUS;
> +			goto out;
> +		}
> +		ext4_set_inode_state(inode, EXT4_STATE_JDATA);
> +	}
> +	ext4_journal_stop(handle);
> +	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
> +		goto retry_alloc;
> +out_ret:
> +	ret = block_page_mkwrite_return(ret);
> +out:
>  	return ret;
>  }

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig - June 23, 2011, 10:39 a.m.
>  	loff_t size;
>  	unsigned long len;
> +	int ret;
>  	struct file *file = vma->vm_file;
>  	struct inode *inode = file->f_path.dentry->d_inode;
>  	struct address_space *mapping = inode->i_mapping;
> +	handle_t *handle;
> +	get_block_t *get_block;
> +	int retries = 0;
>  
>  	/*
> +	 * This check is racy but catches the common case. We rely on
> +	 * __block_page_mkwrite() to do a reliable check.
>  	 */
> +	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
> +	/* Delalloc case is easy... */
> +	if (test_opt(inode->i_sb, DELALLOC) &&
> +	    !ext4_should_journal_data(inode) &&
> +	    !ext4_nonda_switch(inode->i_sb)) {
> +		do {
> +			ret = __block_page_mkwrite(vma, vmf,
> +						   ext4_da_get_block_prep);
> +		} while (ret == -ENOSPC &&
> +		       ext4_should_retry_alloc(inode->i_sb, &retries));
> +		goto out_ret;

Is there any way to simply provide a different vm_operations_struct
and thus ->fault implementation for the delalloc vs non-delalloc case?

I think splitting those two cases completely would make the code a lot
more readable, even if there is a tiny amount of code duplication.

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara - June 23, 2011, 11:09 a.m.
On Thu 23-06-11 06:39:37, Christoph Hellwig wrote:
> >  	loff_t size;
> >  	unsigned long len;
> > +	int ret;
> >  	struct file *file = vma->vm_file;
> >  	struct inode *inode = file->f_path.dentry->d_inode;
> >  	struct address_space *mapping = inode->i_mapping;
> > +	handle_t *handle;
> > +	get_block_t *get_block;
> > +	int retries = 0;
> >  
> >  	/*
> > +	 * This check is racy but catches the common case. We rely on
> > +	 * __block_page_mkwrite() to do a reliable check.
> >  	 */
> > +	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
> > +	/* Delalloc case is easy... */
> > +	if (test_opt(inode->i_sb, DELALLOC) &&
> > +	    !ext4_should_journal_data(inode) &&
> > +	    !ext4_nonda_switch(inode->i_sb)) {
> > +		do {
> > +			ret = __block_page_mkwrite(vma, vmf,
> > +						   ext4_da_get_block_prep);
> > +		} while (ret == -ENOSPC &&
> > +		       ext4_should_retry_alloc(inode->i_sb, &retries));
> > +		goto out_ret;
> 
> Is there any way to simply provide a different vm_operations_struct
> and thus ->fault implementation for the delalloc vs non-delalloc case?
> 
> I think splitting those two cases completely would make the code a lot
> more readable, even if there is a tiny amount of code duplication.
  The trouble is that ext4 falls backs to standard allocation instead of
delayed allocation when we run low on free space. So complete separation
of these two cases is not possible. Also inode data journal flag can be
set on the fly so we have to check it even in delalloc mkwrite
implementation.

So what we could do is to have:
ext4_da_page_mkwrite() which will fall back to ext4_page_mkwrite() in
some special cases. Not sure how big readability win that is...

								Honza

PS: I've just realized that ext4_nonda_switch() used in the new code still
calls writeback code so my comment about ext4_da_write_begin() in the
change log is kind of moot. Can you please remove that sentence? Thanks.
Christoph Hellwig - June 23, 2011, 11:51 a.m.
On Thu, Jun 23, 2011 at 01:09:18PM +0200, Jan Kara wrote:
> > more readable, even if there is a tiny amount of code duplication.
>   The trouble is that ext4 falls backs to standard allocation instead of
> delayed allocation when we run low on free space. So complete separation
> of these two cases is not possible. Also inode data journal flag can be
> set on the fly so we have to check it even in delalloc mkwrite
> implementation.

Yikes!  Ok, in that case there's no benefit in doing the split.

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e3126c0..bd30976 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5843,80 +5843,84 @@  int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct page *page = vmf->page;
 	loff_t size;
 	unsigned long len;
-	int ret = -EINVAL;
-	void *fsdata;
+	int ret;
 	struct file *file = vma->vm_file;
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct address_space *mapping = inode->i_mapping;
+	handle_t *handle;
+	get_block_t *get_block;
+	int retries = 0;
 
 	/*
-	 * Get i_alloc_sem to stop truncates messing with the inode. We cannot
-	 * get i_mutex because we are already holding mmap_sem.
+	 * This check is racy but catches the common case. We rely on
+	 * __block_page_mkwrite() to do a reliable check.
 	 */
-	down_read(&inode->i_alloc_sem);
-	size = i_size_read(inode);
-	if (page->mapping != mapping || size <= page_offset(page)
-	    || !PageUptodate(page)) {
-		/* page got truncated from under us? */
-		goto out_unlock;
+	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+	/* Delalloc case is easy... */
+	if (test_opt(inode->i_sb, DELALLOC) &&
+	    !ext4_should_journal_data(inode) &&
+	    !ext4_nonda_switch(inode->i_sb)) {
+		do {
+			ret = __block_page_mkwrite(vma, vmf,
+						   ext4_da_get_block_prep);
+		} while (ret == -ENOSPC &&
+		       ext4_should_retry_alloc(inode->i_sb, &retries));
+		goto out_ret;
 	}
-	ret = 0;
 
 	lock_page(page);
-	wait_on_page_writeback(page);
-	if (PageMappedToDisk(page)) {
-		up_read(&inode->i_alloc_sem);
-		return VM_FAULT_LOCKED;
+	size = i_size_read(inode);
+	/* Page got truncated from under us? */
+	if (page->mapping != mapping || page_offset(page) > size) {
+		unlock_page(page);
+		ret = VM_FAULT_NOPAGE;
+		goto out;
 	}
 
 	if (page->index == size >> PAGE_CACHE_SHIFT)
 		len = size & ~PAGE_CACHE_MASK;
 	else
 		len = PAGE_CACHE_SIZE;
-
 	/*
-	 * return if we have all the buffers mapped. This avoid
-	 * the need to call write_begin/write_end which does a
-	 * journal_start/journal_stop which can block and take
-	 * long time
+	 * Return if we have all the buffers mapped. This avoids the need to do
+	 * journal_start/journal_stop which can block and take a long time
 	 */
 	if (page_has_buffers(page)) {
 		if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
 					ext4_bh_unmapped)) {
-			up_read(&inode->i_alloc_sem);
-			return VM_FAULT_LOCKED;
+			/* Wait so that we don't change page under IO */
+			wait_on_page_writeback(page);
+			ret = VM_FAULT_LOCKED;
+			goto out;
 		}
 	}
 	unlock_page(page);
-	/*
-	 * OK, we need to fill the hole... Do write_begin write_end
-	 * to do block allocation/reservation.We are not holding
-	 * inode.i__mutex here. That allow * parallel write_begin,
-	 * write_end call. lock_page prevent this from happening
-	 * on the same page though
-	 */
-	ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
-			len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
-	if (ret < 0)
-		goto out_unlock;
-	ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
-			len, len, page, fsdata);
-	if (ret < 0)
-		goto out_unlock;
-	ret = 0;
-
-	/*
-	 * write_begin/end might have created a dirty page and someone
-	 * could wander in and start the IO.  Make sure that hasn't
-	 * happened.
-	 */
-	lock_page(page);
-	wait_on_page_writeback(page);
-	up_read(&inode->i_alloc_sem);
-	return VM_FAULT_LOCKED;
-out_unlock:
-	if (ret)
+	/* OK, we need to fill the hole... */
+	if (ext4_should_dioread_nolock(inode))
+		get_block = ext4_get_block_write;
+	else
+		get_block = ext4_get_block;
+retry_alloc:
+	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+	if (IS_ERR(handle)) {
 		ret = VM_FAULT_SIGBUS;
-	up_read(&inode->i_alloc_sem);
+		goto out;
+	}
+	ret = __block_page_mkwrite(vma, vmf, get_block);
+	if (!ret && ext4_should_journal_data(inode)) {
+		if (walk_page_buffers(handle, page_buffers(page), 0,
+			  PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
+			unlock_page(page);
+			ret = VM_FAULT_SIGBUS;
+			goto out;
+		}
+		ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+	}
+	ext4_journal_stop(handle);
+	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+		goto retry_alloc;
+out_ret:
+	ret = block_page_mkwrite_return(ret);
+out:
 	return ret;
 }