Patchwork [RFC] ext4: Don't send extra barrier during fsync if there are no dirty pages.

login
register
mail settings
Submitter Darrick J. Wong
Date April 29, 2010, 11:51 p.m.
Message ID <20100429235102.GC15607@tux1.beaverton.ibm.com>
Download mbox | patch
Permalink /patch/51366/
State New
Headers show

Comments

Darrick J. Wong - April 29, 2010, 11:51 p.m.
Hmm.  A while ago I was complaining that an evil program that calls fsync() in
a loop will send a continuous stream of write barriers to the hard disk.  Ted
theorized that it might be possible to set a flag in ext4_writepage and clear
it in ext4_sync_file; if we happen to enter ext4_sync_file and the flag isn't
set (meaning that nothing has been dirtied since the last fsync()) then we
could skip issuing the barrier.

Here's an experimental patch to do something sort of like that.  From a quick
run with blktrace, it seems to skip the redundant barriers and improves the ffsb
mail server scores.  However, I haven't done extensive power failure testing to
see how much data it can destroy.  For that matter I'm not even 100% sure it's
correct at what it aims to do.

Just throwing this out there, though.  Nothing's blown up ... yet. :P
---
Signed-off-by: Darrick J. Wong <djwong@us.ibm.com>
---

 fs/ext4/ext4.h  |    2 ++
 fs/ext4/fsync.c |    7 +++++--
 fs/ext4/inode.c |    5 +++++
 3 files changed, 12 insertions(+), 2 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mingming Cao - May 4, 2010, 12:57 a.m.
On Thu, 2010-04-29 at 16:51 -0700, Darrick J. Wong wrote:
> Hmm.  A while ago I was complaining that an evil program that calls fsync() in
> a loop will send a continuous stream of write barriers to the hard disk.  Ted
> theorized that it might be possible to set a flag in ext4_writepage and clear
> it in ext4_sync_file; if we happen to enter ext4_sync_file and the flag isn't
> set (meaning that nothing has been dirtied since the last fsync()) then we
> could skip issuing the barrier.
> 
> Here's an experimental patch to do something sort of like that.  From a quick
> run with blktrace, it seems to skip the redundant barriers and improves the ffsb
> mail server scores.  However, I haven't done extensive power failure testing to
> see how much data it can destroy.  For that matter I'm not even 100% sure it's
> correct at what it aims to do.
> 
> Just throwing this out there, though.  Nothing's blown up ... yet. :P
> ---
> Signed-off-by: Darrick J. Wong <djwong@us.ibm.com>
> ---
> 
>  fs/ext4/ext4.h  |    2 ++
>  fs/ext4/fsync.c |    7 +++++--
>  fs/ext4/inode.c |    5 +++++
>  3 files changed, 12 insertions(+), 2 deletions(-)
> 
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index bf938cf..3b70195 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1025,6 +1025,8 @@ struct ext4_sb_info {
> 
>  	/* workqueue for dio unwritten */
>  	struct workqueue_struct *dio_unwritten_wq;
> +
> +	atomic_t unflushed_writes;
>  };
> 

Just wondering, is this a per-filesystem flag? I thought it would be nicer
to make this a per-inode flag: when there is no dirty data in flight for
this inode (instead of the whole fs), there is no need to issue a barrier
in ext4_sync_file().

Mingming
>  static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
> diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
> index 0d0c323..441f872 100644
> --- a/fs/ext4/fsync.c
> +++ b/fs/ext4/fsync.c
> @@ -52,7 +52,8 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
>  {
>  	struct inode *inode = dentry->d_inode;
>  	struct ext4_inode_info *ei = EXT4_I(inode);
> -	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
> +	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> +	journal_t *journal = sbi->s_journal;
>  	int ret;
>  	tid_t commit_tid;
...

> @@ -102,7 +103,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
>  		    (journal->j_flags & JBD2_BARRIER))
>  			blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
>  		jbd2_log_wait_commit(journal, commit_tid);
> -	} else if (journal->j_flags & JBD2_BARRIER)
> +	} else if (journal->j_flags & JBD2_BARRIER && atomic_read(&sbi->unflushed_writes)) {
> +		atomic_set(&sbi->unflushed_writes, 0);
>  		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
> +	}
>  	return ret;
>  }
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 5381802..e501abd 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -2718,6 +2718,7 @@ static int ext4_writepage(struct page *page,
>  	unsigned int len;
>  	struct buffer_head *page_bufs = NULL;
>  	struct inode *inode = page->mapping->host;
> +	struct ext4_sb_info *sbi = EXT4_SB(page->mapping->host->i_sb);
> 
>  	trace_ext4_writepage(inode, page);
>  	size = i_size_read(inode);
> @@ -2726,6 +2727,8 @@ static int ext4_writepage(struct page *page,
>  	else
>  		len = PAGE_CACHE_SIZE;
> 
> +	atomic_set(&sbi->unflushed_writes, 1);
> +
>  	if (page_has_buffers(page)) {
>  		page_bufs = page_buffers(page);
>  		if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
> @@ -2872,6 +2875,8 @@ static int ext4_da_writepages(struct address_space *mapping,
>  	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
>  		range_whole = 1;
> 
> +	atomic_set(&sbi->unflushed_writes, 1);
> +
>  	range_cyclic = wbc->range_cyclic;
>  	if (wbc->range_cyclic) {
>  		index = mapping->writeback_index;
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ric Wheeler - May 4, 2010, 2:16 p.m.
On 05/03/2010 08:57 PM, Mingming Cao wrote:
> On Thu, 2010-04-29 at 16:51 -0700, Darrick J. Wong wrote:
>    
>> Hmm.  A while ago I was complaining that an evil program that calls fsync() in
>> a loop will send a continuous stream of write barriers to the hard disk.  Ted
>> theorized that it might be possible to set a flag in ext4_writepage and clear
>> it in ext4_sync_file; if we happen to enter ext4_sync_file and the flag isn't
>> set (meaning that nothing has been dirtied since the last fsync()) then we
>> could skip issuing the barrier.
>>
>> Here's an experimental patch to do something sort of like that.  From a quick
>> run with blktrace, it seems to skip the redundant barriers and improves the ffsb
>> mail server scores.  However, I haven't done extensive power failure testing to
>> see how much data it can destroy.  For that matter I'm not even 100% sure it's
>> correct at what it aims to do.
>>
>> Just throwing this out there, though.  Nothing's blown up ... yet. :P
>> ---
>> Signed-off-by: Darrick J. Wong<djwong@us.ibm.com>
>> ---
>>
>>   fs/ext4/ext4.h  |    2 ++
>>   fs/ext4/fsync.c |    7 +++++--
>>   fs/ext4/inode.c |    5 +++++
>>   3 files changed, 12 insertions(+), 2 deletions(-)
>>
>>
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index bf938cf..3b70195 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -1025,6 +1025,8 @@ struct ext4_sb_info {
>>
>>   	/* workqueue for dio unwritten */
>>   	struct workqueue_struct *dio_unwritten_wq;
>> +
>> +	atomic_t unflushed_writes;
>>   };
>>
>>      
> Just wondering is this per filesystem flag? Thought it is nicer to make
> this per -inode flag, when there is no dirty data in fly for this inode
> (instead of the whole fs), there is no need to call barrier in
> ext4_sync_file().
>
> Mingming
>    

Checking per inode is actually incorrect - we do not want to short cut 
the need to flush the target storage device's write cache just because a 
specific file has no dirty pages.  If a power hit occurs, having sent
the pages to the storage device is not sufficient.

I was thinking that it could actually be more general, specifically we 
could track the status of the write cache on the entire storage device. 
That way, any command (write, etc) to the target device would set the 
cache state to needs_flush (or whatever) and the barrier flush would 
clear it.

Probably not worth the complication...

ric


>>   static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
>> diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
>> index 0d0c323..441f872 100644
>> --- a/fs/ext4/fsync.c
>> +++ b/fs/ext4/fsync.c
>> @@ -52,7 +52,8 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
>>   {
>>   	struct inode *inode = dentry->d_inode;
>>   	struct ext4_inode_info *ei = EXT4_I(inode);
>> -	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
>> +	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
>> +	journal_t *journal = sbi->s_journal;
>>   	int ret;
>>   	tid_t commit_tid;
>>      
> ...
>
>    
>> @@ -102,7 +103,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
>>   		    (journal->j_flags&  JBD2_BARRIER))
>>   			blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
>>   		jbd2_log_wait_commit(journal, commit_tid);
>> -	} else if (journal->j_flags&  JBD2_BARRIER)
>> +	} else if (journal->j_flags&  JBD2_BARRIER&&  atomic_read(&sbi->unflushed_writes)) {
>> +		atomic_set(&sbi->unflushed_writes, 0);
>>   		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
>> +	}
>>   	return ret;
>>   }
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index 5381802..e501abd 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -2718,6 +2718,7 @@ static int ext4_writepage(struct page *page,
>>   	unsigned int len;
>>   	struct buffer_head *page_bufs = NULL;
>>   	struct inode *inode = page->mapping->host;
>> +	struct ext4_sb_info *sbi = EXT4_SB(page->mapping->host->i_sb);
>>
>>   	trace_ext4_writepage(inode, page);
>>   	size = i_size_read(inode);
>> @@ -2726,6 +2727,8 @@ static int ext4_writepage(struct page *page,
>>   	else
>>   		len = PAGE_CACHE_SIZE;
>>
>> +	atomic_set(&sbi->unflushed_writes, 1);
>> +
>>   	if (page_has_buffers(page)) {
>>   		page_bufs = page_buffers(page);
>>   		if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
>> @@ -2872,6 +2875,8 @@ static int ext4_da_writepages(struct address_space *mapping,
>>   	if (wbc->range_start == 0&&  wbc->range_end == LLONG_MAX)
>>   		range_whole = 1;
>>
>> +	atomic_set(&sbi->unflushed_writes, 1);
>> +
>>   	range_cyclic = wbc->range_cyclic;
>>   	if (wbc->range_cyclic) {
>>   		index = mapping->writeback_index;
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>      
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>    

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig - May 4, 2010, 3:45 p.m.
On Tue, May 04, 2010 at 10:16:37AM -0400, Ric Wheeler wrote:
> Checking per inode is actually incorrect - we do not want to short cut  
> the need to flush the target storage device's write cache just because a  
> specific file has no dirty pages.  If a power hit occurs, having sent  
> the pages from to the storage device is not sufficient.

As long as we're only using the information for fsync doing it per inode
is the correct thing.  We only want to flush the cache if the inode
(data or metadata) is dirty in some way.  Note that this includes writes
via O_DIRECT which are quite different to track - I've not found the
original patch in my mbox so I can't comment if this is done right.

It might be a good idea to track this information directly in the
writeback/direct I/O code so that we don't have to reimplement it for
every filesystem, btw.

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mingming Cao - May 4, 2010, 7:49 p.m.
On Tue, 2010-05-04 at 10:16 -0400, Ric Wheeler wrote:
> On 05/03/2010 08:57 PM, Mingming Cao wrote:
> > On Thu, 2010-04-29 at 16:51 -0700, Darrick J. Wong wrote:
> >    
> >> Hmm.  A while ago I was complaining that an evil program that calls fsync() in
> >> a loop will send a continuous stream of write barriers to the hard disk.  Ted
> >> theorized that it might be possible to set a flag in ext4_writepage and clear
> >> it in ext4_sync_file; if we happen to enter ext4_sync_file and the flag isn't
> >> set (meaning that nothing has been dirtied since the last fsync()) then we
> >> could skip issuing the barrier.
> >>
> >> Here's an experimental patch to do something sort of like that.  From a quick
> >> run with blktrace, it seems to skip the redundant barriers and improves the ffsb
> >> mail server scores.  However, I haven't done extensive power failure testing to
> >> see how much data it can destroy.  For that matter I'm not even 100% sure it's
> >> correct at what it aims to do.
> >>
> >> Just throwing this out there, though.  Nothing's blown up ... yet. :P
> >> ---
> >> Signed-off-by: Darrick J. Wong<djwong@us.ibm.com>
> >> ---
> >>
> >>   fs/ext4/ext4.h  |    2 ++
> >>   fs/ext4/fsync.c |    7 +++++--
> >>   fs/ext4/inode.c |    5 +++++
> >>   3 files changed, 12 insertions(+), 2 deletions(-)
> >>
> >>
> >> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> >> index bf938cf..3b70195 100644
> >> --- a/fs/ext4/ext4.h
> >> +++ b/fs/ext4/ext4.h
> >> @@ -1025,6 +1025,8 @@ struct ext4_sb_info {
> >>
> >>   	/* workqueue for dio unwritten */
> >>   	struct workqueue_struct *dio_unwritten_wq;
> >> +
> >> +	atomic_t unflushed_writes;
> >>   };
> >>
> >>      
> > Just wondering is this per filesystem flag? Thought it is nicer to make
> > this per -inode flag, when there is no dirty data in fly for this inode
> > (instead of the whole fs), there is no need to call barrier in
> > ext4_sync_file().
> >
> > Mingming
> >    
> 
> Checking per inode is actually incorrect - we do not want to short cut 
> the need to flush the target storage device's write cache just because a 
> specific file has no dirty pages.  If a power hit occurs, having sent 
> the pages from to the storage device is not sufficient.
> 

hmm... My understanding is that the ext3/4 implementation of fsync syncs
the whole filesystem: since a jbd2 transaction could include metadata
updates from other files, jbd2 has to commit the latest transactions.  But
the caller is fsync(), which should only need to ensure the specified
inode's dirty data/metadata gets to disk by sending barriers down.

Mingming

> I was thinking that it could actually be more general, specifically we 
> could track the status of the write cache on the entire storage device. 
> That way, any command (write, etc) to the target device would set the 
> cache state to needs_flush (or whatever) and the barrier flush would 
> clear it.
> 
> Probably not worth the complication...
> 
> ric
> 
> 
> >>   static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
> >> diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
> >> index 0d0c323..441f872 100644
> >> --- a/fs/ext4/fsync.c
> >> +++ b/fs/ext4/fsync.c
> >> @@ -52,7 +52,8 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
> >>   {
> >>   	struct inode *inode = dentry->d_inode;
> >>   	struct ext4_inode_info *ei = EXT4_I(inode);
> >> -	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
> >> +	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> >> +	journal_t *journal = sbi->s_journal;
> >>   	int ret;
> >>   	tid_t commit_tid;
> >>      
> > ...
> >
> >    
> >> @@ -102,7 +103,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
> >>   		    (journal->j_flags&  JBD2_BARRIER))
> >>   			blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
> >>   		jbd2_log_wait_commit(journal, commit_tid);
> >> -	} else if (journal->j_flags&  JBD2_BARRIER)
> >> +	} else if (journal->j_flags&  JBD2_BARRIER&&  atomic_read(&sbi->unflushed_writes)) {
> >> +		atomic_set(&sbi->unflushed_writes, 0);
> >>   		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
> >> +	}
> >>   	return ret;
> >>   }
> >> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> >> index 5381802..e501abd 100644
> >> --- a/fs/ext4/inode.c
> >> +++ b/fs/ext4/inode.c
> >> @@ -2718,6 +2718,7 @@ static int ext4_writepage(struct page *page,
> >>   	unsigned int len;
> >>   	struct buffer_head *page_bufs = NULL;
> >>   	struct inode *inode = page->mapping->host;
> >> +	struct ext4_sb_info *sbi = EXT4_SB(page->mapping->host->i_sb);
> >>
> >>   	trace_ext4_writepage(inode, page);
> >>   	size = i_size_read(inode);
> >> @@ -2726,6 +2727,8 @@ static int ext4_writepage(struct page *page,
> >>   	else
> >>   		len = PAGE_CACHE_SIZE;
> >>
> >> +	atomic_set(&sbi->unflushed_writes, 1);
> >> +
> >>   	if (page_has_buffers(page)) {
> >>   		page_bufs = page_buffers(page);
> >>   		if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
> >> @@ -2872,6 +2875,8 @@ static int ext4_da_writepages(struct address_space *mapping,
> >>   	if (wbc->range_start == 0&&  wbc->range_end == LLONG_MAX)
> >>   		range_whole = 1;
> >>
> >> +	atomic_set(&sbi->unflushed_writes, 1);
> >> +
> >>   	range_cyclic = wbc->range_cyclic;
> >>   	if (wbc->range_cyclic) {
> >>   		index = mapping->writeback_index;
> >> --
> >> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> >> the body of a message to majordomo@vger.kernel.org
> >> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >>      
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >    
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Theodore Ts'o - June 30, 2010, 12:48 p.m.
On Tue, May 04, 2010 at 11:45:53AM -0400, Christoph Hellwig wrote:
> On Tue, May 04, 2010 at 10:16:37AM -0400, Ric Wheeler wrote:
> > Checking per inode is actually incorrect - we do not want to short cut  
> > the need to flush the target storage device's write cache just because a  
> > specific file has no dirty pages.  If a power hit occurs, having sent  
> > the pages from to the storage device is not sufficient.
> 
> As long as we're only using the information for fsync doing it per inode
> is the correct thing.  We only want to flush the cache if the inode
> (data or metadata) is dirty in some way.  Note that this includes writes
> via O_DIRECT which are quite different to track - I've not found the
> original patch in my mbox so I can't comment if this is done right.

I agree.

I wonder if it's worthwhile to think about a new system call which
allows users to provide an array of fd's which collectively should
be fsync'ed out at the same time.  Otherwise, we end up issuing
multiple barrier operations in cases where the application needs to
do:

	fsync(control_fd);
	fsync(data_fd);

						- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ric Wheeler - June 30, 2010, 1:21 p.m.
On 06/30/2010 08:48 AM, tytso@mit.edu wrote:
> On Tue, May 04, 2010 at 11:45:53AM -0400, Christoph Hellwig wrote:
>> On Tue, May 04, 2010 at 10:16:37AM -0400, Ric Wheeler wrote:
>>> Checking per inode is actually incorrect - we do not want to short cut
>>> the need to flush the target storage device's write cache just because a
>>> specific file has no dirty pages.  If a power hit occurs, having sent
>>> the pages from to the storage device is not sufficient.
>>
>> As long as we're only using the information for fsync doing it per inode
>> is the correct thing.  We only want to flush the cache if the inode
>> (data or metadata) is dirty in some way.  Note that this includes writes
>> via O_DIRECT which are quite different to track - I've not found the
>> original patch in my mbox so I can't comment if this is done right.
>
> I agree.
>
> I wonder if it's worthwhile to think about a new system call which
> allows users to provide an array of fd's which are collectively should
> be fsync'ed out at the same time.  Otherwise, we end up issuing
> multiple barrier operations in cases where the application needs to
> do:
>
> 	fsync(control_fd);
> 	fsync(data_fd);
>
> 						- Ted

The problem with not issuing a cache flush when you have dirty meta data or data 
is that it does not have any tie to the state of the volatile write cache of the 
target storage device.

We do need to have fsync() issue the cache flush command even when there is no 
dirty state for the inode in our local page cache in order to flush data that 
was pushed out/cleaned and not followed by a flush.

It would definitely be *very* useful to have an array of fd's that all need
to be fsync()'ed at the same time....

Ric

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Theodore Ts'o - June 30, 2010, 1:44 p.m.
On Wed, Jun 30, 2010 at 09:21:04AM -0400, Ric Wheeler wrote:
> 
> The problem with not issuing a cache flush when you have dirty meta
> data or data is that it does not have any tie to the state of the
> volatile write cache of the target storage device.

We already track whether or not there are any metadata updates associated
with the inode; if there are, we force a journal commit, and this
implies a barrier operation.

The case we're talking about here is one where either (a) there is no
journal, or (b) there have been no metadata updates (I'm simplifying a
little here; in fact we track whether there have been fdatasync()- vs
fsync()- worthy metadata updates), and so there hasn't been a journal
commit to do the cache flush.

In this case, we want to track when is the last time an fsync() has
been issued, versus when was the last time data blocks for a
particular inode have been pushed out to disk.

To use an example I used as motivation for why we might want an
fsync2(int fd[], int flags[], int num) syscall, consider the situation
of:

	fsync(control_fd);
	fdatasync(data_fd);

The first fsync() will have executed a cache flush operation.  So when
we do the fdatasync() (assuming that no metadata needs to be flushed
out to disk), there is no need for the cache flush operation.

If we had an enhanced fsync command, we would also be able to
eliminate a second journal commit in the case where data_fd also had
some metadata that needed to be flushed out to disk.

> It would definitely be *very* useful to have an array of fd's that
> all need fsync()'ed at home time....

Yes, but it would require applications to change their code.

One thing that I would like about a new fsync2() system call is with a
flags field, we could add some new, more expressive flags:

#define FSYNC_DATA    0x0001 /* Only flush metadata if needed to access data */
#define FSYNC_NOWAIT  0x0002 /* Initiate the flush operations but don't wait
		      	        for them to complete */
#define FSYNC_NOBARRIER 0x0004 /* FS may skip the barrier if not needed for fs
		       	     	consistency */

etc.

					- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ric Wheeler - June 30, 2010, 1:54 p.m.
On 06/30/2010 09:44 AM, tytso@mit.edu wrote:
> On Wed, Jun 30, 2010 at 09:21:04AM -0400, Ric Wheeler wrote:
>>
>> The problem with not issuing a cache flush when you have dirty meta
>> data or data is that it does not have any tie to the state of the
>> volatile write cache of the target storage device.
>
> We track whether or not there is any metadata updates associated with
> the inode already; if it does, we force a journal commit, and this
> implies a barrier operation.
>
> The case we're talking about here is one where either (a) there is no
> journal, or (b) there have been no metadata updates (I'm simplifying a
> little here; in fact we track whether there have been fdatasync()- vs
> fsync()- worthy metadata updates), and so there hasn't been a journal
> commit to do the cache flush.
>
> In this case, we want to track when is the last time an fsync() has
> been issued, versus when was the last time data blocks for a
> particular inode have been pushed out to disk.

I think that the state that we want to track is the last time the write cache on 
the target device has been flushed. If the last fsync() did do a full barrier, 
that would be equivalent :-)

ric

>
> To use an example I used as motivation for why we might want an
> fsync2(int fd[], int flags[], int num) syscall, consider the situation
> of:
>
> 	fsync(control_fd);
> 	fdatasync(data_fd);
>
> The first fsync() will have executed a cache flush operation.  So when
> we do the fdatasync() (assuming that no metadata needs to be flushed
> out to disk), there is no need for the cache flush operation.
>
> If we had an enhanced fsync command, we would also be able to
> eliminate a second journal commit in the case where data_fd also had
> some metadata that needed to be flushed out to disk.
>
>> It would definitely be *very* useful to have an array of fd's that
>> all need fsync()'ed at home time....
>
> Yes, but it would require applications to change their code.
>
> One thing that I would like about a new fsync2() system call is with a
> flags field, we could add some new, more expressive flags:
>
> #define FSYNC_DATA    0x0001 /* Only flush metadata if needed to access data */
> #define FSYNC_NOWAIT  0x0002 /* Initiate the flush operations but don't wait
> 		      	        for them to complete */
> #define FSYNC_NOBARRER 0x004 /* FS may skip the barrier if not needed for fs
> 		       	     	consistency */
>
> etc.
>
> 					- Ted

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andreas Dilger - June 30, 2010, 7:05 p.m.
On 2010-06-30, at 07:54, Ric Wheeler wrote:
> On 06/30/2010 09:44 AM, tytso@mit.edu wrote:
>> We track whether or not there is any metadata updates associated with
>> the inode already; if it does, we force a journal commit, and this
>> implies a barrier operation.
>> 
>> The case we're talking about here is one where either (a) there is no
>> journal, or (b) there have been no metadata updates (I'm simplifying a
>> little here; in fact we track whether there have been fdatasync()- vs
>> fsync()- worthy metadata updates), and so there hasn't been a journal
>> commit to do the cache flush.
>> 
>> In this case, we want to track when is the last time an fsync() has
>> been issued, versus when was the last time data blocks for a
>> particular inode have been pushed out to disk.
> 
> I think that the state that we want to track is the last time the write cache on the target device has been flushed. If the last fsync() did do a full barrier, that would be equivalent :-)

We had a similar problem in Lustre, where we want to ensure the integrity of some data on disk, but don't want to force an extra journal commit/barrier if there was already one since the time the write was submitted and before we need it to be on disk.

We fixed this in a similar manner but it is optimized somewhat.  In your case there is a flag on the inode in question, but you should also register a journal commit callback after the IO has been submitted that clears the flag when the journal commits (which also implies a barrier).  This avoids a gratuitous barrier if fsync() is called on this (or any other similarly marked) inode after the journal has already issued the barrier.

The best part is that this gives "POSIXly correct" semantics for applications that are issuing the f{,data}sync() on the modified files, without penalizing them again if the journal happened to do this already in the background in aggregate.

Cheers, Andreas





--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity - Aug. 3, 2010, 1:24 p.m.
On 06/30/2010 03:48 PM, tytso@mit.edu wrote:
>
> I wonder if it's worthwhile to think about a new system call which
> allows users to provide an array of fd's which are collectively should
> be fsync'ed out at the same time.  Otherwise, we end up issuing
> multiple barrier operations in cases where the application needs to
> do:
>
> 	fsync(control_fd);
> 	fsync(data_fd);
>

The system call exists, it's called io_submit().
Theodore Ts'o - Aug. 4, 2010, 11:32 p.m.
On Tue, Aug 03, 2010 at 04:24:49PM +0300, Avi Kivity wrote:
>  On 06/30/2010 03:48 PM, tytso@mit.edu wrote:
> >
> >I wonder if it's worthwhile to think about a new system call which
> >allows users to provide an array of fd's which are collectively should
> >be fsync'ed out at the same time.  Otherwise, we end up issuing
> >multiple barrier operations in cases where the application needs to
> >do:
> >
> >	fsync(control_fd);
> >	fsync(data_fd);
> >
> 
> The system call exists, it's called io_submit().

Um, not the same thing at all.

						- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity - Aug. 5, 2010, 2:20 a.m.
On 08/05/2010 02:32 AM, Ted Ts'o wrote:
> On Tue, Aug 03, 2010 at 04:24:49PM +0300, Avi Kivity wrote:
>>   On 06/30/2010 03:48 PM, tytso@mit.edu wrote:
>>> I wonder if it's worthwhile to think about a new system call which
>>> allows users to provide an array of fd's which are collectively should
>>> be fsync'ed out at the same time.  Otherwise, we end up issuing
>>> multiple barrier operations in cases where the application needs to
>>> do:
>>>
>>> 	fsync(control_fd);
>>> 	fsync(data_fd);
>>>
>> The system call exists, it's called io_submit().
> Um, not the same thing at all.

Why not?  To be clear, I'm talking about an io_submit() with multiple 
IO_CMD_FSYNC requests, with a kernel implementation that is able to 
batch these requests.
Theodore Ts'o - Aug. 5, 2010, 4:17 p.m.
On Thu, Aug 05, 2010 at 05:20:12AM +0300, Avi Kivity wrote:
> 
> Why not?  To be clear, I'm talking about an io_submit() with
> multiple IO_CMD_FSYNC requests, with a kernel implementation that is
> able to batch these requests.

IO_CMD_FSYNC doesn't exist right now, but sure, it means we don't have
to add a new syscall.  I find the aio interface to be horribly
complicated, and it would mean that programs would have to link
against libaio, which again isn't my favorite set of interfaces.  

All of that being said, I do agree that adding a new IO_CMD_FSYNC,
IO_CMD_FSYNCDATA, IO_CMD_FSYNC_NOBARRIER, and
IO_CMD_FSYNC_DATA_NOBARRIER would be the simplest thing to do from a
kernel implementation perspective.

						- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeff Moyer - Aug. 5, 2010, 7:13 p.m.
"Ted Ts'o" <tytso@mit.edu> writes:

> On Thu, Aug 05, 2010 at 05:20:12AM +0300, Avi Kivity wrote:
>> 
>> Why not?  To be clear, I'm talking about an io_submit() with
>> multiple IO_CMD_FSYNC requests, with a kernel implementation that is
>> able to batch these requests.
>
> IO_CMD_FSYNC doesn't exist right now, but sure, it means we don't have

Well, there's IOCB_CMD_FSYNC.  But still, this isn't the same thing as
what's requested.  If I understand correctly, what is requested is a
mechanism to flush out all data for multiple file descriptors and follow
that with a single barrier/flush (and yes, Ted did give a summary of the
commands that would be required to accomplish that).

There still remains the question of why this should be tied to the AIO
submission interface.


Cheers,
Jeff
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Theodore Ts'o - Aug. 5, 2010, 8:39 p.m.
On Thu, Aug 05, 2010 at 03:13:44PM -0400, Jeff Moyer wrote:
> > IO_CMD_FSYNC doesn't exist right now, but sure, it means we don't have
> 
> Well, there's IOCB_CMD_FSYNC.  But still, this isn't the same thing as
> what's requested.  If I understand correctly, what is requested is a
> mechanism to flush out all data for multiple file descriptors and follow
> that with a single barrier/flush (and yes, Ted did give a summary of the
> commands that would be required to accomplish that).
> 
> There still remains the question of why this should be tied to the AIO
> submission interface.

I don't think it should, personally.  The only excuse might be if
someone wanted to do an asynchronous fsync(), but I don't think that
makes sense in most cases.

					- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeff Moyer - Aug. 5, 2010, 8:44 p.m.
"Ted Ts'o" <tytso@mit.edu> writes:

> On Thu, Aug 05, 2010 at 03:13:44PM -0400, Jeff Moyer wrote:
>> > IO_CMD_FSYNC doesn't exist right now, but sure, it means we don't have
>> 
>> Well, there's IOCB_CMD_FSYNC.  But still, this isn't the same thing as
>> what's requested.  If I understand correctly, what is requested is a
>> mechanism to flush out all data for multiple file descriptors and follow
>> that with a single barrier/flush (and yes, Ted did give a summary of the
>> commands that would be required to accomplish that).
>> 
>> There still remains the question of why this should be tied to the AIO
>> submission interface.
>
> I don't think it should, personally.  The only excuse might be if
> someone wanted to do an asynchronous fsync(), but I don't think that
> makes sense in most cases.

In case it wasn't clear, we are in agreement on this.

Cheers,
Jeff
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bf938cf..3b70195 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1025,6 +1025,8 @@  struct ext4_sb_info {
 
 	/* workqueue for dio unwritten */
 	struct workqueue_struct *dio_unwritten_wq;
+
+	atomic_t unflushed_writes;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 0d0c323..441f872 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -52,7 +52,8 @@  int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 {
 	struct inode *inode = dentry->d_inode;
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	journal_t *journal = sbi->s_journal;
 	int ret;
 	tid_t commit_tid;
 
@@ -102,7 +103,9 @@  int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 		    (journal->j_flags & JBD2_BARRIER))
 			blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
 		jbd2_log_wait_commit(journal, commit_tid);
-	} else if (journal->j_flags & JBD2_BARRIER)
+	} else if (journal->j_flags & JBD2_BARRIER && atomic_read(&sbi->unflushed_writes)) {
+		atomic_set(&sbi->unflushed_writes, 0);
 		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+	}
 	return ret;
 }
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5381802..e501abd 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2718,6 +2718,7 @@  static int ext4_writepage(struct page *page,
 	unsigned int len;
 	struct buffer_head *page_bufs = NULL;
 	struct inode *inode = page->mapping->host;
+	struct ext4_sb_info *sbi = EXT4_SB(page->mapping->host->i_sb);
 
 	trace_ext4_writepage(inode, page);
 	size = i_size_read(inode);
@@ -2726,6 +2727,8 @@  static int ext4_writepage(struct page *page,
 	else
 		len = PAGE_CACHE_SIZE;
 
+	atomic_set(&sbi->unflushed_writes, 1);
+
 	if (page_has_buffers(page)) {
 		page_bufs = page_buffers(page);
 		if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
@@ -2872,6 +2875,8 @@  static int ext4_da_writepages(struct address_space *mapping,
 	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
 		range_whole = 1;
 
+	atomic_set(&sbi->unflushed_writes, 1);
+
 	range_cyclic = wbc->range_cyclic;
 	if (wbc->range_cyclic) {
 		index = mapping->writeback_index;