diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -932,6 +932,7 @@ struct ext4_inode_info {
*/
tid_t i_sync_tid;
tid_t i_datasync_tid;
+ atomic_t i_flush_tag;

/* Precomputed uuid+inum+igen checksum for seeding inode checksums */
__u32 i_csum_seed;
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -365,7 +365,15 @@ static inline void ext4_update_inode_fsync_trans(handle_t *handle,
ei->i_sync_tid = handle->h_transaction->t_tid;
if (datasync)
ei->i_datasync_tid = handle->h_transaction->t_tid;
- }
+ } else {
+ struct request_queue *q = bdev_get_queue(inode->i_sb->s_bdev);
+
+ if (q)
+ atomic_set(&EXT4_I(inode)->i_flush_tag,
+ atomic_read(&q->flush_tag));
+ else
+ atomic_set(&EXT4_I(inode)->i_flush_tag, UINT_MAX);
+ }
}

/* super.c */
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -116,10 +116,10 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct inode *inode = file->f_mapping->host;
struct ext4_inode_info *ei = EXT4_I(inode);
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+ bool needs_barrier = journal->j_flags & JBD2_BARRIER;
+ struct request_queue *q = bdev_get_queue(inode->i_sb->s_bdev);
int ret, err;
tid_t commit_tid;
- bool needs_barrier = false;
-
J_ASSERT(ext4_journal_current_handle() == NULL);

trace_ext4_sync_file_enter(file, datasync);
@@ -163,10 +163,16 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
}

commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
- if (journal->j_flags & JBD2_BARRIER &&
- !jbd2_trans_will_send_data_barrier(journal, &commit_tid))
- needs_barrier = true;
ret = jbd2_complete_transaction(journal, commit_tid);
+ /*
+ * We must send a barrier unless we can guarantee that the latest
+ * I/O request for the given inode completed before a new flush
+ * request was QUEUED and COMPLETED by the blkdev.
+ */
+ if (q && ((unsigned int)atomic_read(&q->flush_tag) & ~1U) >
+ (((unsigned int)atomic_read(&ei->i_flush_tag) + 1U) & ~1U))
+ needs_barrier = false;
+
if (needs_barrier) {
err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
if (!ret)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3073,11 +3073,12 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
size);

iocb->private = NULL;
-
/* if not aio dio with unwritten extents, just free io and return */
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
ext4_free_io_end(io_end);
out:
+ if (size)
+ ext4_update_inode_fsync_trans(NULL, inode, 1);
inode_dio_done(inode);
if (is_async)
aio_complete(iocb, ret, 0);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -282,7 +282,7 @@ static void ext4_end_bio(struct bio *bio, int error)
}
io_end->num_io_pages = 0;
inode = io_end->inode;
-
+ ext4_update_inode_fsync_trans(NULL, inode, 1);
if (error) {
io_end->flag |= EXT4_IO_END_ERROR;
ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
Track the blkdev's flush generation counter on a per-inode basis and
update it inside end_io. If the inode's flush generation counter is
older than the blkdev's current flush counter, the inode's data was
already flushed to stable media, so we can skip the explicit barrier.
The optimization is safe only when the inode's end_io was called before
the flush request was QUEUED and COMPLETED. With this optimization we
no longer need the jbd2 flush optimization.

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
 fs/ext4/ext4.h      |  1 +
 fs/ext4/ext4_jbd2.h | 10 +++++++++-
 fs/ext4/fsync.c     | 16 +++++++++++-----
 fs/ext4/inode.c     |  3 ++-
 fs/ext4/page-io.c   |  2 +-
 5 files changed, 24 insertions(+), 8 deletions(-)
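
For reviewers: below is a minimal userspace sketch of the tag arithmetic
used in ext4_sync_file() above. It assumes the q->flush_tag semantics
from the companion block-layer patch (not part of this excerpt): the
counter starts even, is bumped once when a flush request is QUEUED (tag
becomes odd) and once when it COMPLETES (tag becomes even again), so
"tag & ~1U" is the generation of the last fully completed flush. The
helper name flush_covers_inode() and its parameters are illustrative
only, not part of the patch.

#include <stdbool.h>
#include <stdio.h>

/*
 * queue_tag: q->flush_tag sampled at fsync time.
 * inode_tag: ei->i_flush_tag snapshot taken in end_io when the inode's
 * last I/O completed.
 *
 * "(inode_tag + 1U) & ~1U" rounds an odd snapshot up to the next even
 * generation: a flush already in flight when the I/O completed may have
 * been queued before the data reached the device, so it does not count.
 * An even snapshot is left unchanged, so any flush queued and completed
 * after it suffices.
 */
static bool flush_covers_inode(unsigned int queue_tag, unsigned int inode_tag)
{
	return (queue_tag & ~1U) > ((inode_tag + 1U) & ~1U);
}

int main(void)
{
	/* Flush queued after the I/O but still in flight: keep barrier. */
	printf("%d\n", flush_covers_inode(5, 4)); /* 0 */
	/* Flush queued and completed after the I/O: skip the barrier. */
	printf("%d\n", flush_covers_inode(6, 4)); /* 1 */
	/* Snapshot taken mid-flush (odd tag): that flush completing is
	 * not enough... */
	printf("%d\n", flush_covers_inode(6, 5)); /* 0 */
	/* ...but the next full flush cycle is. */
	printf("%d\n", flush_covers_inode(8, 5)); /* 1 */
	return 0;
}

Because the low bit encodes the in-flight state, both sides only need
plain atomic reads, with no locking. Note that the fsync-side check is
also guarded by q being non-NULL, so the barrier is kept whenever no
request queue is available.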