diff mbox series

[PATCHv5,2/9] fs/buffer.c: Add generic_buffer_fsync implementation

Message ID 7a7c48bf0a91d00f1114db2dc6b1269c25f7513b.1681639164.git.ritesh.list@gmail.com
State Not Applicable
Headers show
Series ext2: DIO to use iomap | expand

Commit Message

Ritesh Harjani (IBM) April 16, 2023, 10:08 a.m. UTC
Some of the higher layers like iomap take inode_lock() when calling
generic_write_sync().
Also writeback already happens from other paths without inode lock,
so it's difficult to say that we really need sync_mapping_buffers() to
take any inode locking here. Having said that, let's add
generic_buffer_fsync() implementation in buffer.c with no
inode_lock/unlock() for now so that filesystems like ext2 and
ext4's nojournal mode can use it.

When ext4 was converted to iomap for direct-io, it already copied its own
variant of __generic_file_fsync() without the lock. Hence let's add a helper
API and use it in both ext2 and ext4.

Later we can review other filesystems as well to see if we can make
generic_buffer_fsync() which does not take any inode_lock() as the
default path.

Tested-by: Disha Goel <disgoel@linux.ibm.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
---
 fs/buffer.c                 | 43 +++++++++++++++++++++++++++++++++++++
 include/linux/buffer_head.h |  2 ++
 2 files changed, 45 insertions(+)

Comments

Jan Kara April 17, 2023, 11:01 a.m. UTC | #1
On Sun 16-04-23 15:38:37, Ritesh Harjani (IBM) wrote:
> Some of the higher layers like iomap takes inode_lock() when calling
> generic_write_sync().
> Also writeback already happens from other paths without inode lock,
> so it's difficult to say that we really need sync_mapping_buffers() to
> take any inode locking here. Having said that, let's add
> generic_buffer_fsync() implementation in buffer.c with no
> inode_lock/unlock() for now so that filesystems like ext2 and
> ext4's nojournal mode can use it.
> 
> Ext4 when got converted to iomap for direct-io already copied it's own
> variant of __generic_file_fsync() without lock. Hence let's add a helper
> API and use it both in ext2 and ext4.
> 
> Later we can review other filesystems as well to see if we can make
> generic_buffer_fsync() which does not take any inode_lock() as the
> default path.
> 
> Tested-by: Disha Goel <disgoel@linux.ibm.com>
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>

There is a problem with generic_buffer_fsync() that it does not call
blkdev_issue_flush() so the caller is responsible for doing that. That's
necessary for ext2 & ext4 so fine for now. But historically this was the
case with generic_file_fsync() as well and that led to many filesystems
forgetting to flush caches from fsync(2). What is our transition plan for
these filesystems that currently do the cache flush from
generic_file_fsync()? Do we want to eventually keep generic_file_fsync()
doing the cache flush and call generic_buffer_fsync() instead of
__generic_file_fsync() from it?

								Honza

> ---
>  fs/buffer.c                 | 43 +++++++++++++++++++++++++++++++++++++
>  include/linux/buffer_head.h |  2 ++
>  2 files changed, 45 insertions(+)
> 
> diff --git a/fs/buffer.c b/fs/buffer.c
> index 9e1e2add541e..df98f1966a71 100644
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -593,6 +593,49 @@ int sync_mapping_buffers(struct address_space *mapping)
>  }
>  EXPORT_SYMBOL(sync_mapping_buffers);
>  
> +/**
> + * generic_buffer_fsync - generic buffer fsync implementation
> + * for simple filesystems with no inode lock
> + *
> + * @file:	file to synchronize
> + * @start:	start offset in bytes
> + * @end:	end offset in bytes (inclusive)
> + * @datasync:	only synchronize essential metadata if true
> + *
> + * This is a generic implementation of the fsync method for simple
> + * filesystems which track all non-inode metadata in the buffers list
> + * hanging off the address_space structure.
> + */
> +int generic_buffer_fsync(struct file *file, loff_t start, loff_t end,
> +			 bool datasync)
> +{
> +	struct inode *inode = file->f_mapping->host;
> +	int err;
> +	int ret;
> +
> +	err = file_write_and_wait_range(file, start, end);
> +	if (err)
> +		return err;
> +
> +	ret = sync_mapping_buffers(inode->i_mapping);
> +	if (!(inode->i_state & I_DIRTY_ALL))
> +		goto out;
> +	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
> +		goto out;
> +
> +	err = sync_inode_metadata(inode, 1);
> +	if (ret == 0)
> +		ret = err;
> +
> +out:
> +	/* check and advance again to catch errors after syncing out buffers */
> +	err = file_check_and_advance_wb_err(file);
> +	if (ret == 0)
> +		ret = err;
> +	return ret;
> +}
> +EXPORT_SYMBOL(generic_buffer_fsync);
> +
>  /*
>   * Called when we've recently written block `bblock', and it is known that
>   * `bblock' was for a buffer_boundary() buffer.  This means that the block at
> diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
> index 8f14dca5fed7..3170d0792d52 100644
> --- a/include/linux/buffer_head.h
> +++ b/include/linux/buffer_head.h
> @@ -211,6 +211,8 @@ int inode_has_buffers(struct inode *);
>  void invalidate_inode_buffers(struct inode *);
>  int remove_inode_buffers(struct inode *inode);
>  int sync_mapping_buffers(struct address_space *mapping);
> +int generic_buffer_fsync(struct file *file, loff_t start, loff_t end,
> +			 bool datasync);
>  void clean_bdev_aliases(struct block_device *bdev, sector_t block,
>  			sector_t len);
>  static inline void clean_bdev_bh_alias(struct buffer_head *bh)
> -- 
> 2.39.2
>
Jan Kara April 17, 2023, 11:07 a.m. UTC | #2
On Mon 17-04-23 13:01:49, Jan Kara wrote:
> On Sun 16-04-23 15:38:37, Ritesh Harjani (IBM) wrote:
> > Some of the higher layers like iomap takes inode_lock() when calling
> > generic_write_sync().
> > Also writeback already happens from other paths without inode lock,
> > so it's difficult to say that we really need sync_mapping_buffers() to
> > take any inode locking here. Having said that, let's add
> > generic_buffer_fsync() implementation in buffer.c with no
> > inode_lock/unlock() for now so that filesystems like ext2 and
> > ext4's nojournal mode can use it.
> > 
> > Ext4 when got converted to iomap for direct-io already copied it's own
> > variant of __generic_file_fsync() without lock. Hence let's add a helper
> > API and use it both in ext2 and ext4.
> > 
> > Later we can review other filesystems as well to see if we can make
> > generic_buffer_fsync() which does not take any inode_lock() as the
> > default path.
> > 
> > Tested-by: Disha Goel <disgoel@linux.ibm.com>
> > Reviewed-by: Christoph Hellwig <hch@lst.de>
> > Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
> 
> There is a problem with generic_buffer_fsync() that it does not call
> blkdev_issue_flush() so the caller is responsible for doing that. That's
> necessary for ext2 & ext4 so fine for now.

Actually a slight correction: ext2 could use a variant of
generic_buffer_fsync() that flushes disk caches.

								Honza
Ritesh Harjani (IBM) April 17, 2023, 11:38 a.m. UTC | #3
Jan Kara <jack@suse.cz> writes:

> On Sun 16-04-23 15:38:37, Ritesh Harjani (IBM) wrote:
>> Some of the higher layers like iomap takes inode_lock() when calling
>> generic_write_sync().
>> Also writeback already happens from other paths without inode lock,
>> so it's difficult to say that we really need sync_mapping_buffers() to
>> take any inode locking here. Having said that, let's add
>> generic_buffer_fsync() implementation in buffer.c with no
>> inode_lock/unlock() for now so that filesystems like ext2 and
>> ext4's nojournal mode can use it.
>>
>> Ext4 when got converted to iomap for direct-io already copied it's own
>> variant of __generic_file_fsync() without lock. Hence let's add a helper
>> API and use it both in ext2 and ext4.
>>
>> Later we can review other filesystems as well to see if we can make
>> generic_buffer_fsync() which does not take any inode_lock() as the
>> default path.
>>
>> Tested-by: Disha Goel <disgoel@linux.ibm.com>
>> Reviewed-by: Christoph Hellwig <hch@lst.de>
>> Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
>
> There is a problem with generic_buffer_fsync() that it does not call
> blkdev_issue_flush() so the caller is responsible for doing that. That's
> necessary for ext2 & ext4 so fine for now. But historically this was the
> case with generic_file_fsync() as well and that led to many filesystem
> forgetting to flush caches from fsync(2).

Ok, thanks for the details.

> What is our transition plan for
> these filesystems that currently do the cache flush from
> generic_file_fsync()? Do we want to eventually keep generic_file_fsync()
> doing the cache flush and call generic_buffer_fsync() instead of
> __generic_buffer_fsync() from it?

Frankly speaking, I was thinking we will come back to this question
maybe when we start working on those changes. At this point in time
I only looked at it from ext2 DIO changes perspective.

But since you asked, here is what I think we could do -

Rename generic_file_fsync() => generic_buffers_sync() and move it to fs/buffer.c
Then
generic_buffers_sync() {
    ret = generic_buffers_fsync()
    if (!ret)
       blkdev_issue_flush()
}

generic_buffers_fsync() is same as in this patch which does not have the
cache flush operation.
(will rename from generic_buffer_fsync() to generic_buffers_fsync())

Note: The naming is kept such that-
- sync means it will do fsync followed by cache flush.
- fsync means it will only do the file fsync

As I understand - we would eventually like to kill the
inode_lock() variants of generic_file_fsync() and __generic_file_fsync()
after auditing other filesystem code, right?

Then for now what we need is generic_buffers_sync() function which does
not take an inode_lock() and also does cache flush which is required for ext2.
And generic_buffers_fsync() which does not do any cache flush operations
required by filesystem like ext4.

Does that sound good to you? Is the naming also proper?

If yes, then I can rename the below function to generic_buffers_fsync()
and also create implementation of generic_buffers_sync().
Then let ext2 and ext4 use them.


-ritesh


>
> 								Honza
>
>> ---
>>  fs/buffer.c                 | 43 +++++++++++++++++++++++++++++++++++++
>>  include/linux/buffer_head.h |  2 ++
>>  2 files changed, 45 insertions(+)
>>
>> diff --git a/fs/buffer.c b/fs/buffer.c
>> index 9e1e2add541e..df98f1966a71 100644
>> --- a/fs/buffer.c
>> +++ b/fs/buffer.c
>> @@ -593,6 +593,49 @@ int sync_mapping_buffers(struct address_space *mapping)
>>  }
>>  EXPORT_SYMBOL(sync_mapping_buffers);
>>
>> +/**
>> + * generic_buffer_fsync - generic buffer fsync implementation
>> + * for simple filesystems with no inode lock
>> + *
>> + * @file:	file to synchronize
>> + * @start:	start offset in bytes
>> + * @end:	end offset in bytes (inclusive)
>> + * @datasync:	only synchronize essential metadata if true
>> + *
>> + * This is a generic implementation of the fsync method for simple
>> + * filesystems which track all non-inode metadata in the buffers list
>> + * hanging off the address_space structure.
>> + */
>> +int generic_buffer_fsync(struct file *file, loff_t start, loff_t end,
>> +			 bool datasync)
>> +{
>> +	struct inode *inode = file->f_mapping->host;
>> +	int err;
>> +	int ret;
>> +
>> +	err = file_write_and_wait_range(file, start, end);
>> +	if (err)
>> +		return err;
>> +
>> +	ret = sync_mapping_buffers(inode->i_mapping);
>> +	if (!(inode->i_state & I_DIRTY_ALL))
>> +		goto out;
>> +	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
>> +		goto out;
>> +
>> +	err = sync_inode_metadata(inode, 1);
>> +	if (ret == 0)
>> +		ret = err;
>> +
>> +out:
>> +	/* check and advance again to catch errors after syncing out buffers */
>> +	err = file_check_and_advance_wb_err(file);
>> +	if (ret == 0)
>> +		ret = err;
>> +	return ret;
>> +}
>> +EXPORT_SYMBOL(generic_buffer_fsync);
>> +
>>  /*
>>   * Called when we've recently written block `bblock', and it is known that
>>   * `bblock' was for a buffer_boundary() buffer.  This means that the block at
>> diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
>> index 8f14dca5fed7..3170d0792d52 100644
>> --- a/include/linux/buffer_head.h
>> +++ b/include/linux/buffer_head.h
>> @@ -211,6 +211,8 @@ int inode_has_buffers(struct inode *);
>>  void invalidate_inode_buffers(struct inode *);
>>  int remove_inode_buffers(struct inode *inode);
>>  int sync_mapping_buffers(struct address_space *mapping);
>> +int generic_buffer_fsync(struct file *file, loff_t start, loff_t end,
>> +			 bool datasync);
>>  void clean_bdev_aliases(struct block_device *bdev, sector_t block,
>>  			sector_t len);
>>  static inline void clean_bdev_bh_alias(struct buffer_head *bh)
>> --
>> 2.39.2
>>
> --
> Jan Kara <jack@suse.com>
> SUSE Labs, CR
Jan Kara April 17, 2023, 4:45 p.m. UTC | #4
On Mon 17-04-23 17:08:57, Ritesh Harjani wrote:
> Jan Kara <jack@suse.cz> writes:
> 
> > On Sun 16-04-23 15:38:37, Ritesh Harjani (IBM) wrote:
> >> Some of the higher layers like iomap takes inode_lock() when calling
> >> generic_write_sync().
> >> Also writeback already happens from other paths without inode lock,
> >> so it's difficult to say that we really need sync_mapping_buffers() to
> >> take any inode locking here. Having said that, let's add
> >> generic_buffer_fsync() implementation in buffer.c with no
> >> inode_lock/unlock() for now so that filesystems like ext2 and
> >> ext4's nojournal mode can use it.
> >>
> >> Ext4 when got converted to iomap for direct-io already copied it's own
> >> variant of __generic_file_fsync() without lock. Hence let's add a helper
> >> API and use it both in ext2 and ext4.
> >>
> >> Later we can review other filesystems as well to see if we can make
> >> generic_buffer_fsync() which does not take any inode_lock() as the
> >> default path.
> >>
> >> Tested-by: Disha Goel <disgoel@linux.ibm.com>
> >> Reviewed-by: Christoph Hellwig <hch@lst.de>
> >> Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
> >
> > There is a problem with generic_buffer_fsync() that it does not call
> > blkdev_issue_flush() so the caller is responsible for doing that. That's
> > necessary for ext2 & ext4 so fine for now. But historically this was the
> > case with generic_file_fsync() as well and that led to many filesystem
> > forgetting to flush caches from fsync(2).
> 
> Ok, thanks for the details.
> 
> > What is our transition plan for
> > these filesystems that currently do the cache flush from
> > generic_file_fsync()? Do we want to eventually keep generic_file_fsync()
> > doing the cache flush and call generic_buffer_fsync() instead of
> > __generic_buffer_fsync() from it?
> 
> Frankly speaking, I was thinking we will come back to this question
> maybe when we start working on those changes. At this point in time
> I only looked at it from ext2 DIO changes perspective.

Yes, we can return to this later. The only thing I wanted to kind of make
sure is we don't have to rename the function again when adding support for
other filesystems (although even that would not be a big issue given there
are two callers).

> But since you asked, here is what I think we could do -
> 
> Rename generic_file_fsync => generic_buffers_sync() to fs/buffers.c
> Then
> generic_buffers_sync() {
>     ret = generic_buffers_fsync()
>     if (!ret)
>        blkdev_issue_flush()
> }
> 
> generic_buffers_fsync() is same as in this patch which does not have the
> cache flush operation.
> (will rename from generic_buffer_fsync() to generic_buffers_fsync())
> 
> Note: The naming is kept such that-
> - sync means it will do fsync followed by cache flush.
> - fsync means it will only do the file fsync

Hum, I think the difference sync vs fsync is too subtle and non-obvious.
I can see sensible pairs like:

	__generic_buffers_fsync() - "__" indicates you should know what you
				are doing when calling this
	generic_buffers_fsync()

or

	generic_buffers_fsync()
	generic_file_fsync() - difficult at this point as there's name
			       clash

or

	generic_buffers_fsync_noflush()
	generic_buffers_fsync() - obvious what the default "safe" choice
				  is.

or something like that.

> As I understand - we would eventually like to kill the
> inode_lock() variants of generic_file_fsync() and __generic_file_fsync()
> after auditing other filesystem code, right?

Yes.

> Then for now what we need is generic_buffers_sync() function which does
> not take an inode_lock() and also does cache flush which is required for ext2.
> And generic_buffers_fsync() which does not do any cache flush operations
> required by filesystem like ext4.
> 
> Does that sound good to you? Is the naming also proper?

I agree with the plan, just the naming is hard :)

								Honza
Christoph Hellwig April 18, 2023, 5:04 a.m. UTC | #5
On Mon, Apr 17, 2023 at 06:45:50PM +0200, Jan Kara wrote:
> Hum, I think the difference sync vs fsync is too subtle and non-obvious.

Agreed.

> I can see sensible pairs like:
> 
> 	__generic_buffers_fsync() - "__" indicates you should know what you
> 				are doing when calling this
> 	generic_buffers_fsync()
> 
> or
> 
> 	generic_buffers_fsync()
> 	generic_file_fsync() - difficult at this point as there's name
> 			       clash
> 
> or
> 
> 	generic_buffers_fsync_noflush()
> 	generic_buffers_fsync() - obvious what the default "safe" choice
> 				  is.
> 
> or something like that.

I'd prefer the last option as the most explicit one.
Ritesh Harjani (IBM) April 20, 2023, 2:42 p.m. UTC | #6
Christoph Hellwig <hch@infradead.org> writes:

> On Mon, Apr 17, 2023 at 06:45:50PM +0200, Jan Kara wrote:
>> Hum, I think the difference sync vs fsync is too subtle and non-obvious.
>
> Agreed.
>
>> I can see sensible pairs like:
>>
>> 	__generic_buffers_fsync() - "__" indicates you should know what you
>> 				are doing when calling this
>> 	generic_buffers_fsync()
>>
>> or
>>
>> 	generic_buffers_fsync()
>> 	generic_file_fsync() - difficult at this point as there's name
>> 			       clash
>>
>> or
>>
>> 	generic_buffers_fsync_noflush()
>> 	generic_buffers_fsync() - obvious what the default "safe" choice
>> 				  is.
>>
>> or something like that.
>
> I'd prefer the last option as the most explicit one.

Yes. I was going to use this one as this is more explicit.

Thanks Jan & Christoph,
I will spin a new revision soon with the suggested changes.

-ritesh
diff mbox series

Patch

diff --git a/fs/buffer.c b/fs/buffer.c
index 9e1e2add541e..df98f1966a71 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -593,6 +593,49 @@  int sync_mapping_buffers(struct address_space *mapping)
 }
 EXPORT_SYMBOL(sync_mapping_buffers);
 
+/**
+ * generic_buffer_fsync - generic fsync for simple buffer-based filesystems
+ *
+ * @file:	file to synchronize
+ * @start:	start offset in bytes
+ * @end:	end offset in bytes (inclusive)
+ * @datasync:	only synchronize essential metadata if true
+ *
+ * This is a generic implementation of the fsync method for simple
+ * filesystems which track all non-inode metadata in the buffers list
+ * hanging off the address_space structure. It takes no inode lock and
+ * issues no block device cache flush. Returns 0 or the first error seen.
+ */
+int generic_buffer_fsync(struct file *file, loff_t start, loff_t end,
+			 bool datasync)
+{
+	struct inode *inode = file->f_mapping->host;
+	int err;
+	int ret;
+
+	err = file_write_and_wait_range(file, start, end);
+	if (err)
+		return err;
+
+	ret = sync_mapping_buffers(inode->i_mapping);
+	if (!(inode->i_state & I_DIRTY_ALL))
+		goto out;
+	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+		goto out;
+
+	err = sync_inode_metadata(inode, 1);
+	if (ret == 0)
+		ret = err;
+
+out:
+	/* check and advance again to catch errors after syncing out buffers */
+	err = file_check_and_advance_wb_err(file);
+	if (ret == 0)
+		ret = err;
+	return ret;
+}
+EXPORT_SYMBOL(generic_buffer_fsync);
+
 /*
  * Called when we've recently written block `bblock', and it is known that
  * `bblock' was for a buffer_boundary() buffer.  This means that the block at
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 8f14dca5fed7..3170d0792d52 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -211,6 +211,8 @@  int inode_has_buffers(struct inode *);
 void invalidate_inode_buffers(struct inode *);
 int remove_inode_buffers(struct inode *inode);
 int sync_mapping_buffers(struct address_space *mapping);
+int generic_buffer_fsync(struct file *file, loff_t start, loff_t end,
+			 bool datasync);
 void clean_bdev_aliases(struct block_device *bdev, sector_t block,
 			sector_t len);
 static inline void clean_bdev_bh_alias(struct buffer_head *bh)