diff mbox series

[2/3] mm: Provide address_space operation for filling pages for read

Message ID 20210120160611.26853-3-jack@suse.cz
State New
Headers show
Series fs: Hole punch vs page cache filling races | expand

Commit Message

Jan Kara Jan. 20, 2021, 4:06 p.m. UTC
Provide an address_space operation for filling pages needed for read
into page cache. Filesystems can use this operation to serialize
page cache filling with e.g. hole punching properly.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/fs.h |  5 +++++
 mm/filemap.c       | 32 ++++++++++++++++++++++++++------
 2 files changed, 31 insertions(+), 6 deletions(-)

Comments

Christoph Hellwig Jan. 20, 2021, 4:20 p.m. UTC | #1
On Wed, Jan 20, 2021 at 05:06:10PM +0100, Jan Kara wrote:
> Provide an address_space operation for filling pages needed for read
> into page cache. Filesystems can use this operation to seriealize
> page cache filling with e.g. hole punching properly.

Besides the impending rewrite of the area - having another indirection
here is just horrible for performance.  If we want locking in this area
it should be in core code and common for multiple file systems.
Jan Kara Jan. 20, 2021, 5:27 p.m. UTC | #2
On Wed 20-01-21 16:20:01, Christoph Hellwig wrote:
> On Wed, Jan 20, 2021 at 05:06:10PM +0100, Jan Kara wrote:
> > Provide an address_space operation for filling pages needed for read
> > into page cache. Filesystems can use this operation to seriealize
> > page cache filling with e.g. hole punching properly.
> 
> Besides the impending rewrite of the area - having another indirection
> here is just horrible for performance.  If we want locking in this area
> it should be in core code and common for multiple file systems.

This would mean pulling i_mmap_sem out from ext4/XFS/F2FS private inode
into the VFS inode. Which is fine by me but it would grow struct inode for
proc / tmpfs / btrfs (although for btrfs I'm not convinced it isn't
actually prone to the race and doesn't need similar protection as xfs /
ext4) so some people may object.

								Honza
Christoph Hellwig Jan. 20, 2021, 5:28 p.m. UTC | #3
On Wed, Jan 20, 2021 at 06:27:05PM +0100, Jan Kara wrote:
> This would mean pulling i_mmap_sem out from ext4/XFS/F2FS private inode
> into the VFS inode. Which is fine by me but it would grow struct inode for
> proc / tmpfs / btrfs (although for btrfs I'm not convinced it isn't
> actually prone to the race and doesn't need similar protection as xfs /
> ext4) so some people may object.

The btrfs folks are already looking into lifting it to common code.

Also I have a patch pending to remove a list_head from the inode to
counter the size increase :)
Matthew Wilcox Jan. 20, 2021, 5:56 p.m. UTC | #4
On Wed, Jan 20, 2021 at 05:28:36PM +0000, Christoph Hellwig wrote:
> On Wed, Jan 20, 2021 at 06:27:05PM +0100, Jan Kara wrote:
> > This would mean pulling i_mmap_sem out from ext4/XFS/F2FS private inode
> > into the VFS inode. Which is fine by me but it would grow struct inode for
> > proc / tmpfs / btrfs (although for btrfs I'm not convinced it isn't
> > actually prone to the race and doesn't need similar protection as xfs /
> > ext4) so some people may object.
> 
> The btrfs folks are already looking into lifting it to common code.
> 
> Also I have a patch pending to remove a list_head from the inode to
> counter the size increase :)

We can get rid of nrexceptional as well:

https://lore.kernel.org/linux-fsdevel/20201026151849.24232-1-willy@infradead.org/

We can also reduce the size of the rwsem by one word by replacing the list_head with a single pointer.

https://git.infradead.org/users/willy/pagecache.git/shortlog/refs/heads/wlist

(I haven't touched it in almost four years; it seemed to have a bug the
last time I looked at it.)
diff mbox series

Patch

diff --git a/include/linux/fs.h b/include/linux/fs.h
index fd47deea7c17..1d3f963d0d99 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -381,6 +381,9 @@  struct address_space_operations {
 	int (*readpages)(struct file *filp, struct address_space *mapping,
 			struct list_head *pages, unsigned nr_pages);
 	void (*readahead)(struct readahead_control *);
+	/* Fill in uptodate pages for kiocb into page cache and pagep array */
+	int (*fill_pages)(struct kiocb *, size_t len, bool partial_page,
+			  struct page **pagep, unsigned int nr_pages);
 
 	int (*write_begin)(struct file *, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned flags,
@@ -2962,6 +2965,8 @@  extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
 extern int generic_write_check_limits(struct file *file, loff_t pos,
 		loff_t *count);
 extern int generic_file_rw_checks(struct file *file_in, struct file *file_out);
+extern int generic_file_buffered_read_get_pages(struct kiocb *iocb, size_t len,
+		bool partial_page, struct page **pages, unsigned int nr);
 extern ssize_t generic_file_buffered_read(struct kiocb *iocb,
 		struct iov_iter *to, ssize_t already_read);
 extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
diff --git a/mm/filemap.c b/mm/filemap.c
index 7029bada8e90..5b594dd245e0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2333,10 +2333,24 @@  generic_file_buffered_read_no_cached_page(struct kiocb *iocb)
 	return generic_file_buffered_read_readpage(iocb, filp, mapping, page);
 }
 
-static int generic_file_buffered_read_get_pages(struct kiocb *iocb,
-						size_t len, bool partial_page,
-						struct page **pages,
-						unsigned int nr)
+/**
+ * generic_file_buffered_read_get_pages - generic routine to fill in page cache
+ *	pages for a read
+ * @iocb:	the iocb to read
+ * @len:	number of bytes to read
+ * @partial_page:	are partially uptodate pages accepted by read?
+ * @pages:	array where to fill in found pages
+ * @nr:		number of pages in the @pages array
+ *
+ * Fill pages into page cache and @pages array needed for a read of length @len
+ * described by @iocb.
+ *
+ * Return:
+ * Number of pages filled in the array
+ */
+int generic_file_buffered_read_get_pages(struct kiocb *iocb, size_t len,
+					 bool partial_page,
+					 struct page **pages, unsigned int nr)
 {
 	struct file *filp = iocb->ki_filp;
 	struct address_space *mapping = filp->f_mapping;
@@ -2419,6 +2433,7 @@  static int generic_file_buffered_read_get_pages(struct kiocb *iocb,
 	 */
 	goto find_page;
 }
+EXPORT_SYMBOL(generic_file_buffered_read_get_pages);
 
 /**
  * generic_file_buffered_read - generic file read routine
@@ -2447,6 +2462,8 @@  ssize_t generic_file_buffered_read(struct kiocb *iocb,
 	unsigned int nr_pages = min_t(unsigned int, 512,
 			((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) -
 			(iocb->ki_pos >> PAGE_SHIFT));
+	int (*fill_pages)(struct kiocb *, size_t, bool, struct page **,
+			  unsigned int) = mapping->a_ops->fill_pages;
 	int i, pg_nr, error = 0;
 	bool writably_mapped;
 	loff_t isize, end_offset;
@@ -2456,6 +2473,9 @@  ssize_t generic_file_buffered_read(struct kiocb *iocb,
 	if (unlikely(!iov_iter_count(iter)))
 		return 0;
 
+	if (!fill_pages)
+		fill_pages = generic_file_buffered_read_get_pages;
+
 	iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
 
 	if (nr_pages > ARRAY_SIZE(pages_onstack))
@@ -2478,8 +2498,8 @@  ssize_t generic_file_buffered_read(struct kiocb *iocb,
 			iocb->ki_flags |= IOCB_NOWAIT;
 
 		i = 0;
-		pg_nr = generic_file_buffered_read_get_pages(iocb, iter->count,
-				!iov_iter_is_pipe(iter), pages, nr_pages);
+		pg_nr = fill_pages(iocb, iter->count, !iov_iter_is_pipe(iter),
+				   pages, nr_pages);
 		if (pg_nr < 0) {
 			error = pg_nr;
 			break;