diff mbox

[3/3] fs: Fix remaining filesystems to wait for stable page writeback

Message ID 20121101075829.16153.92036.stgit@blackbox.djwong.org
State Not Applicable, archived
Headers show

Commit Message

Darrick Wong Nov. 1, 2012, 7:58 a.m. UTC
Fix up the filesystems that provide their own ->page_mkwrite handlers to
provide stable page writes if necessary.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/9p/vfs_file.c |    1 +
 fs/afs/write.c   |    4 ++--
 fs/ceph/addr.c   |    1 +
 fs/cifs/file.c   |    1 +
 fs/ocfs2/mmap.c  |    1 +
 fs/ubifs/file.c  |    4 ++--
 6 files changed, 8 insertions(+), 4 deletions(-)



--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Jan Kara Nov. 1, 2012, 12:36 p.m. UTC | #1
On Thu 01-11-12 00:58:29, Darrick J. Wong wrote:
> Fix up the filesystems that provide their own ->page_mkwrite handlers to
> provide stable page writes if necessary.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  fs/9p/vfs_file.c |    1 +
>  fs/afs/write.c   |    4 ++--
>  fs/ceph/addr.c   |    1 +
>  fs/cifs/file.c   |    1 +
>  fs/ocfs2/mmap.c  |    1 +
>  fs/ubifs/file.c  |    4 ++--
>  6 files changed, 8 insertions(+), 4 deletions(-)
> 
> 
> diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
> index c2483e9..aa253f0 100644
> --- a/fs/9p/vfs_file.c
> +++ b/fs/9p/vfs_file.c
> @@ -620,6 +620,7 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
>  	lock_page(page);
>  	if (page->mapping != inode->i_mapping)
>  		goto out_unlock;
> +	wait_on_stable_page_write(page);
>  
>  	return VM_FAULT_LOCKED;
>  out_unlock:
> diff --git a/fs/afs/write.c b/fs/afs/write.c
> index 9aa52d9..39eb2a4 100644
> --- a/fs/afs/write.c
> +++ b/fs/afs/write.c
> @@ -758,7 +758,7 @@ int afs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
>  #ifdef CONFIG_AFS_FSCACHE
>  	fscache_wait_on_page_write(vnode->cache, page);
>  #endif
> -
> +	wait_on_stable_page_write(page);
>  	_leave(" = 0");
> -	return 0;
> +	return VM_FAULT_LOCKED;
>  }
  Oh, I missed these two since I've got confused by
fscache_wait_on_page_write().

> diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
> index 47a87dd..a0027b1 100644
> --- a/fs/ocfs2/mmap.c
> +++ b/fs/ocfs2/mmap.c
> @@ -124,6 +124,7 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
>  				     fsdata);
>  	BUG_ON(ret != len);
>  	ret = VM_FAULT_LOCKED;
> +	wait_on_stable_page_write(page);
>  out:
>  	return ret;
>  }
  Actually, this is not so easy. ocfs2 doesn't use
grab_cache_page_write_begin() so you have to modify write_begin() path as
well. And then you don't have to modify __ocfs2_page_mkwrite() because it
uses ocfs2_write_begin(). Preferably teach it to use
grab_cache_page_write_begin()...

  And I think you are missing ncpfs. Because ncp_file_mmap does not set
.mkwrite - it should use filemap_page_mkwrite() I think.

								Honza
Boaz Harrosh Nov. 1, 2012, 6:43 p.m. UTC | #2
On 11/01/2012 12:58 AM, Darrick J. Wong wrote:
> Fix up the filesystems that provide their own ->page_mkwrite handlers to
> provide stable page writes if necessary.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  fs/9p/vfs_file.c |    1 +
>  fs/afs/write.c   |    4 ++--
>  fs/ceph/addr.c   |    1 +
>  fs/cifs/file.c   |    1 +
>  fs/ocfs2/mmap.c  |    1 +
>  fs/ubifs/file.c  |    4 ++--
>  6 files changed, 8 insertions(+), 4 deletions(-)
> 
> 
> diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
> index c2483e9..aa253f0 100644
> --- a/fs/9p/vfs_file.c
> +++ b/fs/9p/vfs_file.c
> @@ -620,6 +620,7 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
>  	lock_page(page);
>  	if (page->mapping != inode->i_mapping)
>  		goto out_unlock;
> +	wait_on_stable_page_write(page);
>  

Good god thanks, yes please ;-)

>  	return VM_FAULT_LOCKED;
>  out_unlock:
> diff --git a/fs/afs/write.c b/fs/afs/write.c
> index 9aa52d9..39eb2a4 100644
> --- a/fs/afs/write.c
> +++ b/fs/afs/write.c
> @@ -758,7 +758,7 @@ int afs_page_mkwrite(struct vm_area_struct *vma, struct page *page)

afs, is it not a network filesystem? which means that it has it's own emulated none-block-device
BDI, registered internally. So if you do need stable pages someone should call
bdi_require_stable_pages()

But again since it is a network filesystem I don't see how it is needed, and/or it might be
taken care of already.

>  #ifdef CONFIG_AFS_FSCACHE
>  	fscache_wait_on_page_write(vnode->cache, page);
>  #endif
> -
> +	wait_on_stable_page_write(page);
>  	_leave(" = 0");
> -	return 0;
> +	return VM_FAULT_LOCKED;
>  }
> diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c

CEPH for sure has it's own "emulated none-block-device BDI". This one is also
a pure networking filesystem.

And it already does what it needs to do with wait_on_writeback().

So i do not think you should touch CEPH

> index 6690269..e9734bf 100644
> --- a/fs/ceph/addr.c
> +++ b/fs/ceph/addr.c
> @@ -1208,6 +1208,7 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
>  		set_page_dirty(page);
>  		up_read(&mdsc->snap_rwsem);
>  		ret = VM_FAULT_LOCKED;
> +		wait_on_stable_page_write(page);
>  	} else {
>  		if (ret == -ENOMEM)
>  			ret = VM_FAULT_OOM;
> diff --git a/fs/cifs/file.c b/fs/cifs/file.c

Cifs also self-BDI network filesystem, but

> index edb25b4..a8770bf 100644
> --- a/fs/cifs/file.c
> +++ b/fs/cifs/file.c
> @@ -2997,6 +2997,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
>  	struct page *page = vmf->page;
>  
>  	lock_page(page);

It waits by locking the page, that's cifs naive way of waiting for writeback

> +	wait_on_stable_page_write(page);

Instead it could do better and not override page_mkwrite at all, and all it needs
to do is call bdi_require_stable_pages() at it's own registered BDI

>  	return VM_FAULT_LOCKED;
>  }
>  
> diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
> index 47a87dd..a0027b1 100644
> --- a/fs/ocfs2/mmap.c
> +++ b/fs/ocfs2/mmap.c
> @@ -124,6 +124,7 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
>  				     fsdata);
>  	BUG_ON(ret != len);
>  	ret = VM_FAULT_LOCKED;
> +	wait_on_stable_page_write(page);
>  out:
>  	return ret;
>  }
> diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
> index 5bc7781..cb0d3aa 100644
> --- a/fs/ubifs/file.c
> +++ b/fs/ubifs/file.c
> @@ -1522,8 +1522,8 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma,
>  			ubifs_release_dirty_inode_budget(c, ui);
>  	}
>  
> -	unlock_page(page);
> -	return 0;
> +	wait_on_stable_page_write(page);

ubifs has it's special ubi block device. So someone needs to call bdi_require_stable_pages()
for this to work.

I think that here too. The existing code, like cifs, calls page_lock, as a way of
waiting for writeback.

So this is certainly not finished.

> +	return VM_FAULT_LOCKED;
>  
>  out_unlock:
>  	unlock_page(page);
> 

Cheers
Boaz
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeff Layton Nov. 1, 2012, 8:22 p.m. UTC | #3
On Thu, 1 Nov 2012 11:43:26 -0700
Boaz Harrosh <bharrosh@panasas.com> wrote:

> On 11/01/2012 12:58 AM, Darrick J. Wong wrote:
> > Fix up the filesystems that provide their own ->page_mkwrite handlers to
> > provide stable page writes if necessary.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> >  fs/9p/vfs_file.c |    1 +
> >  fs/afs/write.c   |    4 ++--
> >  fs/ceph/addr.c   |    1 +
> >  fs/cifs/file.c   |    1 +
> >  fs/ocfs2/mmap.c  |    1 +
> >  fs/ubifs/file.c  |    4 ++--
> >  6 files changed, 8 insertions(+), 4 deletions(-)
> > 
> > 
> > diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
> > index c2483e9..aa253f0 100644
> > --- a/fs/9p/vfs_file.c
> > +++ b/fs/9p/vfs_file.c
> > @@ -620,6 +620,7 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
> >  	lock_page(page);
> >  	if (page->mapping != inode->i_mapping)
> >  		goto out_unlock;
> > +	wait_on_stable_page_write(page);
> >  
> 
> Good god thanks, yes please ;-)
> 
> >  	return VM_FAULT_LOCKED;
> >  out_unlock:
> > diff --git a/fs/afs/write.c b/fs/afs/write.c
> > index 9aa52d9..39eb2a4 100644
> > --- a/fs/afs/write.c
> > +++ b/fs/afs/write.c
> > @@ -758,7 +758,7 @@ int afs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
> 
> afs, is it not a network filesystem? which means that it has it's own emulated none-block-device
> BDI, registered internally. So if you do need stable pages someone should call
> bdi_require_stable_pages()
> 
> But again since it is a network filesystem I don't see how it is needed, and/or it might be
> taken care of already.
> 
> >  #ifdef CONFIG_AFS_FSCACHE
> >  	fscache_wait_on_page_write(vnode->cache, page);
> >  #endif
> > -
> > +	wait_on_stable_page_write(page);
> >  	_leave(" = 0");
> > -	return 0;
> > +	return VM_FAULT_LOCKED;
> >  }
> > diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
> 
> CEPH for sure has it's own "emulated none-block-device BDI". This one is also
> a pure networking filesystem.
> 
> And it already does what it needs to do with wait_on_writeback().
> 
> So i do not think you should touch CEPH
> 
> > index 6690269..e9734bf 100644
> > --- a/fs/ceph/addr.c
> > +++ b/fs/ceph/addr.c
> > @@ -1208,6 +1208,7 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
> >  		set_page_dirty(page);
> >  		up_read(&mdsc->snap_rwsem);
> >  		ret = VM_FAULT_LOCKED;
> > +		wait_on_stable_page_write(page);
> >  	} else {
> >  		if (ret == -ENOMEM)
> >  			ret = VM_FAULT_OOM;
> > diff --git a/fs/cifs/file.c b/fs/cifs/file.c
> 
> Cifs also self-BDI network filesystem, but
> 
> > index edb25b4..a8770bf 100644
> > --- a/fs/cifs/file.c
> > +++ b/fs/cifs/file.c
> > @@ -2997,6 +2997,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
> >  	struct page *page = vmf->page;
> >  
> >  	lock_page(page);
> 
> It waits by locking the page, that's cifs naive way of waiting for writeback
> 
> > +	wait_on_stable_page_write(page);
> 
> Instead it could do better and not override page_mkwrite at all, and all it needs
> to do is call bdi_require_stable_pages() at it's own registered BDI
> 

Hmm...I don't know...

I've never been crazy about using the page lock for this, but in the
absence of a better way to guarantee stable pages, it was what I ended
up with at the time. cifs_writepages will hold the page lock until
kernel_sendmsg returns. At that point the TCP layer will have copied
off the page data so it's safe to release it.

With this change though, we're going to end up blocking until the
writeback flag clears, right? And I think that will happen when the
reply comes in? So, we'll end up blocking for much longer than is
really necessary in page_mkwrite with this change.
Boaz Harrosh Nov. 1, 2012, 10:23 p.m. UTC | #4
On 11/01/2012 01:22 PM, Jeff Layton wrote:
> Hmm...I don't know...
> 
> I've never been crazy about using the page lock for this, but in the
> absence of a better way to guarantee stable pages, it was what I ended
> up with at the time. cifs_writepages will hold the page lock until
> kernel_sendmsg returns. At that point the TCP layer will have copied
> off the page data so it's safe to release it.
> 
> With this change though, we're going to end up blocking until the
> writeback flag clears, right? And I think that will happen when the
> reply comes in? So, we'll end up blocking for much longer than is
> really necessary in page_mkwrite with this change.
> 

Hmm OK, that is a very good point. In that case it is just a simple
nack on Darrick's hunk to cifs. cifs is fine and should not be touched

Boaz
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick Wong Nov. 1, 2012, 10:47 p.m. UTC | #5
On Thu, Nov 01, 2012 at 04:22:54PM -0400, Jeff Layton wrote:
> On Thu, 1 Nov 2012 11:43:26 -0700
> Boaz Harrosh <bharrosh@panasas.com> wrote:
> 
> > On 11/01/2012 12:58 AM, Darrick J. Wong wrote:
> > > Fix up the filesystems that provide their own ->page_mkwrite handlers to
> > > provide stable page writes if necessary.
> > > 
> > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > ---
> > >  fs/9p/vfs_file.c |    1 +
> > >  fs/afs/write.c   |    4 ++--
> > >  fs/ceph/addr.c   |    1 +
> > >  fs/cifs/file.c   |    1 +
> > >  fs/ocfs2/mmap.c  |    1 +
> > >  fs/ubifs/file.c  |    4 ++--
> > >  6 files changed, 8 insertions(+), 4 deletions(-)
> > > 
> > > 
> > > diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
> > > index c2483e9..aa253f0 100644
> > > --- a/fs/9p/vfs_file.c
> > > +++ b/fs/9p/vfs_file.c
> > > @@ -620,6 +620,7 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
> > >  	lock_page(page);
> > >  	if (page->mapping != inode->i_mapping)
> > >  		goto out_unlock;
> > > +	wait_on_stable_page_write(page);
> > >  
> > 
> > Good god thanks, yes please ;-)
> > 
> > >  	return VM_FAULT_LOCKED;
> > >  out_unlock:
> > > diff --git a/fs/afs/write.c b/fs/afs/write.c
> > > index 9aa52d9..39eb2a4 100644
> > > --- a/fs/afs/write.c
> > > +++ b/fs/afs/write.c
> > > @@ -758,7 +758,7 @@ int afs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
> > 
> > afs, is it not a network filesystem? which means that it has it's own emulated none-block-device
> > BDI, registered internally. So if you do need stable pages someone should call
> > bdi_require_stable_pages()
> > 
> > But again since it is a network filesystem I don't see how it is needed, and/or it might be
> > taken care of already.
> > 
> > >  #ifdef CONFIG_AFS_FSCACHE
> > >  	fscache_wait_on_page_write(vnode->cache, page);
> > >  #endif
> > > -
> > > +	wait_on_stable_page_write(page);
> > >  	_leave(" = 0");
> > > -	return 0;
> > > +	return VM_FAULT_LOCKED;
> > >  }
> > > diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
> > 
> > CEPH for sure has it's own "emulated none-block-device BDI". This one is also
> > a pure networking filesystem.
> > 
> > And it already does what it needs to do with wait_on_writeback().
> > 
> > So i do not think you should touch CEPH
> > 
> > > index 6690269..e9734bf 100644
> > > --- a/fs/ceph/addr.c
> > > +++ b/fs/ceph/addr.c
> > > @@ -1208,6 +1208,7 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
> > >  		set_page_dirty(page);
> > >  		up_read(&mdsc->snap_rwsem);
> > >  		ret = VM_FAULT_LOCKED;
> > > +		wait_on_stable_page_write(page);
> > >  	} else {
> > >  		if (ret == -ENOMEM)
> > >  			ret = VM_FAULT_OOM;
> > > diff --git a/fs/cifs/file.c b/fs/cifs/file.c
> > 
> > Cifs also self-BDI network filesystem, but
> > 
> > > index edb25b4..a8770bf 100644
> > > --- a/fs/cifs/file.c
> > > +++ b/fs/cifs/file.c
> > > @@ -2997,6 +2997,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
> > >  	struct page *page = vmf->page;
> > >  
> > >  	lock_page(page);
> > 
> > It waits by locking the page, that's cifs naive way of waiting for writeback
> > 
> > > +	wait_on_stable_page_write(page);
> > 
> > Instead it could do better and not override page_mkwrite at all, and all it needs
> > to do is call bdi_require_stable_pages() at it's own registered BDI
> > 
> 
> Hmm...I don't know...
> 
> I've never been crazy about using the page lock for this, but in the
> absence of a better way to guarantee stable pages, it was what I ended
> up with at the time. cifs_writepages will hold the page lock until
> kernel_sendmsg returns. At that point the TCP layer will have copied
> off the page data so it's safe to release it.
> 
> With this change though, we're going to end up blocking until the
> writeback flag clears, right? And I think that will happen when the
> reply comes in? So, we'll end up blocking for much longer than is
> really necessary in page_mkwrite with this change.

That's a very good point to make-- network FSes can stop the stable-waiting
after the request is sent.  Can I interest you in a new page flag (PG_stable)
that indicates when a page has to be held for stable write?  Along with a
modification to wait_on_stable_page_write that uses the new PG_stable flag
instead of just writeback?  Then, you can clear PG_stable right after the
sendmsg() and release the page for further activity without having to overload
the page lock.

I wrote a patch that does exactly that as part of my work to defer the
integrity checksumming until the last possible instant.  However, I haven't
gotten that part to work yet, so I left the PG_stable patch out of this
submission.  On the other hand, it sounds like you could use it.

--D
> 
> -- 
> Jeff Layton <jlayton@samba.org>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeff Layton Nov. 2, 2012, 12:36 a.m. UTC | #6
On Thu, 1 Nov 2012 15:47:30 -0700
"Darrick J. Wong" <darrick.wong@oracle.com> wrote:

> On Thu, Nov 01, 2012 at 04:22:54PM -0400, Jeff Layton wrote:
> > On Thu, 1 Nov 2012 11:43:26 -0700
> > Boaz Harrosh <bharrosh@panasas.com> wrote:
> > 
> > > On 11/01/2012 12:58 AM, Darrick J. Wong wrote:
> > > > Fix up the filesystems that provide their own ->page_mkwrite handlers to
> > > > provide stable page writes if necessary.
> > > > 
> > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > ---
> > > >  fs/9p/vfs_file.c |    1 +
> > > >  fs/afs/write.c   |    4 ++--
> > > >  fs/ceph/addr.c   |    1 +
> > > >  fs/cifs/file.c   |    1 +
> > > >  fs/ocfs2/mmap.c  |    1 +
> > > >  fs/ubifs/file.c  |    4 ++--
> > > >  6 files changed, 8 insertions(+), 4 deletions(-)
> > > > 
> > > > 
> > > > diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
> > > > index c2483e9..aa253f0 100644
> > > > --- a/fs/9p/vfs_file.c
> > > > +++ b/fs/9p/vfs_file.c
> > > > @@ -620,6 +620,7 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
> > > >  	lock_page(page);
> > > >  	if (page->mapping != inode->i_mapping)
> > > >  		goto out_unlock;
> > > > +	wait_on_stable_page_write(page);
> > > >  
> > > 
> > > Good god thanks, yes please ;-)
> > > 
> > > >  	return VM_FAULT_LOCKED;
> > > >  out_unlock:
> > > > diff --git a/fs/afs/write.c b/fs/afs/write.c
> > > > index 9aa52d9..39eb2a4 100644
> > > > --- a/fs/afs/write.c
> > > > +++ b/fs/afs/write.c
> > > > @@ -758,7 +758,7 @@ int afs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
> > > 
> > > afs, is it not a network filesystem? which means that it has it's own emulated none-block-device
> > > BDI, registered internally. So if you do need stable pages someone should call
> > > bdi_require_stable_pages()
> > > 
> > > But again since it is a network filesystem I don't see how it is needed, and/or it might be
> > > taken care of already.
> > > 
> > > >  #ifdef CONFIG_AFS_FSCACHE
> > > >  	fscache_wait_on_page_write(vnode->cache, page);
> > > >  #endif
> > > > -
> > > > +	wait_on_stable_page_write(page);
> > > >  	_leave(" = 0");
> > > > -	return 0;
> > > > +	return VM_FAULT_LOCKED;
> > > >  }
> > > > diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
> > > 
> > > CEPH for sure has it's own "emulated none-block-device BDI". This one is also
> > > a pure networking filesystem.
> > > 
> > > And it already does what it needs to do with wait_on_writeback().
> > > 
> > > So i do not think you should touch CEPH
> > > 
> > > > index 6690269..e9734bf 100644
> > > > --- a/fs/ceph/addr.c
> > > > +++ b/fs/ceph/addr.c
> > > > @@ -1208,6 +1208,7 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
> > > >  		set_page_dirty(page);
> > > >  		up_read(&mdsc->snap_rwsem);
> > > >  		ret = VM_FAULT_LOCKED;
> > > > +		wait_on_stable_page_write(page);
> > > >  	} else {
> > > >  		if (ret == -ENOMEM)
> > > >  			ret = VM_FAULT_OOM;
> > > > diff --git a/fs/cifs/file.c b/fs/cifs/file.c
> > > 
> > > Cifs also self-BDI network filesystem, but
> > > 
> > > > index edb25b4..a8770bf 100644
> > > > --- a/fs/cifs/file.c
> > > > +++ b/fs/cifs/file.c
> > > > @@ -2997,6 +2997,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
> > > >  	struct page *page = vmf->page;
> > > >  
> > > >  	lock_page(page);
> > > 
> > > It waits by locking the page, that's cifs naive way of waiting for writeback
> > > 
> > > > +	wait_on_stable_page_write(page);
> > > 
> > > Instead it could do better and not override page_mkwrite at all, and all it needs
> > > to do is call bdi_require_stable_pages() at it's own registered BDI
> > > 
> > 
> > Hmm...I don't know...
> > 
> > I've never been crazy about using the page lock for this, but in the
> > absence of a better way to guarantee stable pages, it was what I ended
> > up with at the time. cifs_writepages will hold the page lock until
> > kernel_sendmsg returns. At that point the TCP layer will have copied
> > off the page data so it's safe to release it.
> > 
> > With this change though, we're going to end up blocking until the
> > writeback flag clears, right? And I think that will happen when the
> > reply comes in? So, we'll end up blocking for much longer than is
> > really necessary in page_mkwrite with this change.
> 
> That's a very good point to make-- network FSes can stop the stable-waiting
> after the request is sent.

Well, it depends...

If the fs in question uses kernel_sendpage (or the equivalent) then the
page will be inlined into the fraglist of the skb. If you use that,
then you can't just drop it after the send. It's also possible that the
fs doesn't care about page stability at all (like if signatures aren't
being used).

So I think you probably need to account for several different
possibilities of "page stability lifetimes" here...

> Can I interest you in a new page flag (PG_stable)
> that indicates when a page has to be held for stable write?  Along with a
> modification to wait_on_stable_page_write that uses the new PG_stable flag
> instead of just writeback?  Then, you can clear PG_stable right after the
> sendmsg() and release the page for further activity without having to overload
> the page lock.
> 
> I wrote a patch that does exactly that as part of my work to defer the
> integrity checksumming until the last possible instant.  However, I haven't
> gotten that part to work yet, so I left the PG_stable patch out of this
> submission.  On the other hand, it sounds like you could use it.
> 

That sounds much more suitable for CIFS and possibly for others too.
diff mbox

Patch

diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index c2483e9..aa253f0 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -620,6 +620,7 @@  v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	lock_page(page);
 	if (page->mapping != inode->i_mapping)
 		goto out_unlock;
+	wait_on_stable_page_write(page);
 
 	return VM_FAULT_LOCKED;
 out_unlock:
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 9aa52d9..39eb2a4 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -758,7 +758,7 @@  int afs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 #ifdef CONFIG_AFS_FSCACHE
 	fscache_wait_on_page_write(vnode->cache, page);
 #endif
-
+	wait_on_stable_page_write(page);
 	_leave(" = 0");
-	return 0;
+	return VM_FAULT_LOCKED;
 }
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 6690269..e9734bf 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1208,6 +1208,7 @@  static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 		set_page_dirty(page);
 		up_read(&mdsc->snap_rwsem);
 		ret = VM_FAULT_LOCKED;
+		wait_on_stable_page_write(page);
 	} else {
 		if (ret == -ENOMEM)
 			ret = VM_FAULT_OOM;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index edb25b4..a8770bf 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2997,6 +2997,7 @@  cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct page *page = vmf->page;
 
 	lock_page(page);
+	wait_on_stable_page_write(page);
 	return VM_FAULT_LOCKED;
 }
 
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 47a87dd..a0027b1 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -124,6 +124,7 @@  static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
 				     fsdata);
 	BUG_ON(ret != len);
 	ret = VM_FAULT_LOCKED;
+	wait_on_stable_page_write(page);
 out:
 	return ret;
 }
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 5bc7781..cb0d3aa 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1522,8 +1522,8 @@  static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma,
 			ubifs_release_dirty_inode_budget(c, ui);
 	}
 
-	unlock_page(page);
-	return 0;
+	wait_on_stable_page_write(page);
+	return VM_FAULT_LOCKED;
 
 out_unlock:
 	unlock_page(page);