[7/7] ext4: Support for synchronous DAX faults

Message ID 20170727131245.28279-8-jack@suse.cz
State Superseded
Headers show

Commit Message

Jan Kara July 27, 2017, 1:12 p.m.
We return IOMAP_F_NEEDDSYNC flag from ext4_iomap_begin() for a
synchronous write fault when inode has some uncommitted metadata
changes. In the fault handler ext4_dax_fault() we then detect this case,
call vfs_fsync_range() to make sure all metadata is committed, and call
dax_pfn_mkwrite() to mark PTE as writeable. Note that this will also
dirty corresponding radix tree entry which is what we want - fsync(2)
will still provide data integrity guarantees for applications not using
userspace flushing. And applications using userspace flushing can avoid
calling fsync(2) and thus avoid the performance overhead.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/file.c       | 35 +++++++++++++++++++++++++++++------
 fs/ext4/inode.c      |  4 ++++
 fs/jbd2/journal.c    | 16 ++++++++++++++++
 include/linux/jbd2.h |  1 +
 4 files changed, 50 insertions(+), 6 deletions(-)

Comments

Ross Zwisler July 27, 2017, 10:57 p.m. | #1
On Thu, Jul 27, 2017 at 03:12:45PM +0200, Jan Kara wrote:
> We return IOMAP_F_NEEDDSYNC flag from ext4_iomap_begin() for a
> synchronous write fault when inode has some uncommitted metadata
> changes. In the fault handler ext4_dax_fault() we then detect this case,
> call vfs_fsync_range() to make sure all metadata is committed, and call
> dax_pfn_mkwrite() to mark PTE as writeable. Note that this will also
> dirty corresponding radix tree entry which is what we want - fsync(2)
> will still provide data integrity guarantees for applications not using
> userspace flushing. And applications using userspace flushing can avoid
> calling fsync(2) and thus avoid the performance overhead.
> 
> Signed-off-by: Jan Kara <jack@suse.cz>
> ---
>  fs/ext4/file.c       | 35 +++++++++++++++++++++++++++++------
>  fs/ext4/inode.c      |  4 ++++
>  fs/jbd2/journal.c    | 16 ++++++++++++++++
>  include/linux/jbd2.h |  1 +
>  4 files changed, 50 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index d401403e5095..b221d0b546b0 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -287,16 +287,39 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
>  		down_read(&EXT4_I(inode)->i_mmap_sem);
>  		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
>  					       EXT4_DATA_TRANS_BLOCKS(sb));
> +		if (IS_ERR(handle)) {
> +			up_read(&EXT4_I(inode)->i_mmap_sem);
> +			sb_end_pagefault(sb);
> +			return VM_FAULT_SIGBUS;
> +		}

Yay, this error handling seems cleaner to me anyway.

>  	} else {
>  		down_read(&EXT4_I(inode)->i_mmap_sem);
>  	}
> -	if (!IS_ERR(handle))
> -		result = dax_iomap_fault(vmf, pe_size, false, &ext4_iomap_ops);
> -	else
> -		result = VM_FAULT_SIGBUS;
> +	result = dax_iomap_fault(vmf, pe_size, IS_SYNC(inode), &ext4_iomap_ops);
>  	if (write) {
> -		if (!IS_ERR(handle))
> -			ext4_journal_stop(handle);
> +		ext4_journal_stop(handle);
> +		/* Write fault but PFN mapped only RO? */
> +		if (result & VM_FAULT_RO) {
> +			int err;
> +			loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
> +			size_t len = 0;
> +
> +			if (pe_size == PE_SIZE_PTE)
> +				len = PAGE_SIZE;
> +#ifdef CONFIG_FS_DAX_PMD
> +			else if (pe_size == PE_SIZE_PMD)
> +				len = HPAGE_PMD_SIZE;
> +			else
> +				WARN_ON_ONCE(1);

I think this "else WARN_ON_ONCE(1);" should live outside of the
CONFIG_FS_DAX_PMD so that we get warned in all configs if we get an
unsupported pe_size.

> +#endif
> +			WARN_ON_ONCE(!IS_SYNC(inode));
> +			err = vfs_fsync_range(vmf->vma->vm_file, start,
> +					      start + len - 1, 1);
> +			if (err)
> +				result = VM_FAULT_SIGBUS;
> +			else
> +				result = dax_pfn_mkwrite(vmf, pe_size);
> +		}
>  		up_read(&EXT4_I(inode)->i_mmap_sem);
>  		sb_end_pagefault(sb);
>  	} else {
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 3c600f02673f..e68231bb227c 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -3429,6 +3429,10 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
>  	}
>  
>  	iomap->flags = 0;
> +	if ((flags & IOMAP_FAULT) && (flags & IOMAP_WRITE) && IS_SYNC(inode) &&
> +	    !jbd2_transaction_committed(EXT4_SB(inode->i_sb)->s_journal,
> +					EXT4_I(inode)->i_datasync_tid))
> +		iomap->flags |= IOMAP_F_NEEDDSYNC;

Do we need to check for (flags & IOMAP_FAULT), or can we rely on the fact that
we are in ext4_iomap_begin()?

Patch

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index d401403e5095..b221d0b546b0 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -287,16 +287,39 @@  static int ext4_dax_huge_fault(struct vm_fault *vmf,
 		down_read(&EXT4_I(inode)->i_mmap_sem);
 		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
 					       EXT4_DATA_TRANS_BLOCKS(sb));
+		if (IS_ERR(handle)) {
+			up_read(&EXT4_I(inode)->i_mmap_sem);
+			sb_end_pagefault(sb);
+			return VM_FAULT_SIGBUS;
+		}
 	} else {
 		down_read(&EXT4_I(inode)->i_mmap_sem);
 	}
-	if (!IS_ERR(handle))
-		result = dax_iomap_fault(vmf, pe_size, false, &ext4_iomap_ops);
-	else
-		result = VM_FAULT_SIGBUS;
+	result = dax_iomap_fault(vmf, pe_size, IS_SYNC(inode), &ext4_iomap_ops);
 	if (write) {
-		if (!IS_ERR(handle))
-			ext4_journal_stop(handle);
+		ext4_journal_stop(handle);
+		/* Write fault but PFN mapped only RO? */
+		if (result & VM_FAULT_RO) {
+			int err;
+			loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
+			size_t len = 0;
+
+			if (pe_size == PE_SIZE_PTE)
+				len = PAGE_SIZE;
+#ifdef CONFIG_FS_DAX_PMD
+			else if (pe_size == PE_SIZE_PMD)
+				len = HPAGE_PMD_SIZE;
+			else
+				WARN_ON_ONCE(1);
+#endif
+			WARN_ON_ONCE(!IS_SYNC(inode));
+			err = vfs_fsync_range(vmf->vma->vm_file, start,
+					      start + len - 1, 1);
+			if (err)
+				result = VM_FAULT_SIGBUS;
+			else
+				result = dax_pfn_mkwrite(vmf, pe_size);
+		}
 		up_read(&EXT4_I(inode)->i_mmap_sem);
 		sb_end_pagefault(sb);
 	} else {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3c600f02673f..e68231bb227c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3429,6 +3429,10 @@  static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 	}
 
 	iomap->flags = 0;
+	if ((flags & IOMAP_FAULT) && (flags & IOMAP_WRITE) && IS_SYNC(inode) &&
+	    !jbd2_transaction_committed(EXT4_SB(inode->i_sb)->s_journal,
+					EXT4_I(inode)->i_datasync_tid))
+		iomap->flags |= IOMAP_F_NEEDDSYNC;
 	bdev = inode->i_sb->s_bdev;
 	iomap->bdev = bdev;
 	if (blk_queue_dax(bdev->bd_queue))
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 7d5ef3bf3f3e..e0f436c42d67 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -738,6 +738,22 @@  int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
 	return err;
 }
 
+/* Return 1 when transaction with given tid has already committed. */
+int jbd2_transaction_committed(journal_t *journal, tid_t tid)
+{
+	int ret = 1;
+
+	read_lock(&journal->j_state_lock);
+	if (journal->j_running_transaction &&
+	    journal->j_running_transaction->t_tid == tid)
+		ret = 0;
+	if (journal->j_committing_transaction &&
+	    journal->j_committing_transaction->t_tid == tid)
+		ret = 0;
+	read_unlock(&journal->j_state_lock);
+	return ret;
+}
+
 /*
  * When this function returns the transaction corresponding to tid
  * will be completed.  If the transaction has currently running, start
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 606b6bce3a5b..296d1e0ea87b 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1367,6 +1367,7 @@  int jbd2_log_start_commit(journal_t *journal, tid_t tid);
 int __jbd2_log_start_commit(journal_t *journal, tid_t tid);
 int jbd2_journal_start_commit(journal_t *journal, tid_t *tid);
 int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
+int jbd2_transaction_committed(journal_t *journal, tid_t tid);
 int jbd2_complete_transaction(journal_t *journal, tid_t tid);
 int jbd2_log_do_checkpoint(journal_t *journal);
 int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid);