[5/7] dax, iomap: Add support for synchronous faults

Message ID 20170727131245.28279-6-jack@suse.cz
State Superseded
Headers show

Commit Message

Jan Kara July 27, 2017, 1:12 p.m.
Add a flag to iomap interface informing the caller that inode needs
fdatasync(2) for returned extent to become persistent and use it in DAX
fault code so that we map such extents only read only. We propagate the
information that the page table entry has been inserted write-protected
from dax_iomap_fault() with a new VM_FAULT_RO flag. Filesystem fault
handler is then responsible for calling fdatasync(2) and updating page
tables to map pfns read-write. dax_iomap_fault() also takes care of
updating vmf->orig_pte to match the PTE that was inserted so that we can
safely recheck that PTE did not change while write-enabling it.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/dax.c              | 42 +++++++++++++++++++++++++++++++++++-------
 include/linux/iomap.h |  2 ++
 include/linux/mm.h    |  2 ++
 3 files changed, 39 insertions(+), 7 deletions(-)

Comments

Ross Zwisler July 27, 2017, 10:42 p.m. | #1
On Thu, Jul 27, 2017 at 03:12:43PM +0200, Jan Kara wrote:
> Add a flag to iomap interface informing the caller that inode needs
> fdatasync(2) for returned extent to become persistent and use it in DAX
> fault code so that we map such extents only read only. We propagate the
> information that the page table entry has been inserted write-protected
> from dax_iomap_fault() with a new VM_FAULT_RO flag. Filesystem fault
> handler is then responsible for calling fdatasync(2) and updating page
> tables to map pfns read-write. dax_iomap_fault() also takes care of
> updating vmf->orig_pte to match the PTE that was inserted so that we can
> safely recheck that PTE did not change while write-enabling it.
> 
> Signed-off-by: Jan Kara <jack@suse.cz>
<>
> @@ -1385,9 +1409,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, bool sync,
>  	if (iomap.offset + iomap.length < pos + PMD_SIZE)
>  		goto finish_iomap;
>  
> +	force_ro = (vmf->flags & FAULT_FLAG_WRITE) && sync &&
> +			(iomap.flags & IOMAP_F_NEEDDSYNC);

I already mentioned this in my response to your cover letter, but I think that
we can just lean really heavily on IOMAP_F_NEEDDSYNC and have that let us know
that we are doing sync faults and that the fault is a write.  That simplifies
a few things in this patch, with the above just becoming:

	force_ro = (iomap.flags & IOMAP_F_NEEDDSYNC);

> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index fa036093e76c..5085647d9f2f 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1142,6 +1142,8 @@ static inline void clear_page_pfmemalloc(struct page *page)
>  #define VM_FAULT_RETRY	0x0400	/* ->fault blocked, must retry */
>  #define VM_FAULT_FALLBACK 0x0800	/* huge page fault failed, fall back to small */
>  #define VM_FAULT_DONE_COW   0x1000	/* ->fault has fully handled COW */
> +#define VM_FAULT_RO	0x2000		/* Write fault was handled just by
> +					 * inserting RO page table entry for DAX */

I wonder if we should name this flag something a little stronger and more
specific to its usage with respect to DAX and sync faults?  Maybe
"VM_FAULT_NEEDDSYNC" for consistency with the iomap flag?
Christoph Hellwig Aug. 1, 2017, 10:56 a.m. | #2
On Thu, Jul 27, 2017 at 04:42:45PM -0600, Ross Zwisler wrote:
> I wonder if we should name this flag something a little stronger and more
> specific to its usage with respect to DAX and sync faults?  Maybe
> "VM_FAULT_NEEDDSYNC" for consistency with the iomap flag?

Agreed.

Patch

diff --git a/fs/dax.c b/fs/dax.c
index 9658975b926a..8a6cf158c691 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -829,7 +829,7 @@  static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
 }
 
 static int dax_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
-			      loff_t pos, void *entry)
+			      loff_t pos, void *entry, bool force_ro)
 {
 	const sector_t sector = dax_iomap_sector(iomap, pos);
 	struct vm_area_struct *vma = vmf->vma;
@@ -858,7 +858,7 @@  static int dax_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
 		return PTR_ERR(ret);
 
 	trace_dax_insert_mapping(mapping->host, vmf, ret);
-	if (vmf->flags & FAULT_FLAG_WRITE)
+	if ((vmf->flags & FAULT_FLAG_WRITE) && !force_ro)
 		rc = vm_insert_mixed_mkwrite(vma, vaddr, pfn);
 	else
 		rc = vm_insert_mixed(vma, vaddr, pfn);
@@ -870,6 +870,14 @@  static int dax_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
 	vmf_ret = dax_fault_return(rc);
 	if (iomap->flags & IOMAP_F_NEW)
 		vmf_ret |= VM_FAULT_MAJOR;
+	if (!rc && (vmf->flags & FAULT_FLAG_WRITE) && force_ro) {
+		vmf_ret |= VM_FAULT_RO;
+		/*
+		 * Hack: Store PFN here so that we can pass it to
+		 * vm_insert_mixed_mkwrite() when changing PTE to RW.
+		 */
+		vmf->orig_pte = pfn_t_pte(pfn, vma->vm_page_prot);
+	}
 	return vmf_ret;
 }
 
@@ -1092,6 +1100,7 @@  static int dax_iomap_pte_fault(struct vm_fault *vmf, bool sync,
 	int error;
 	int vmf_ret = 0;
 	void *entry;
+	bool force_ro;
 
 	trace_dax_pte_fault(inode, vmf, vmf_ret);
 	/*
@@ -1167,13 +1176,15 @@  static int dax_iomap_pte_fault(struct vm_fault *vmf, bool sync,
 		goto finish_iomap;
 	}
 
+	force_ro = (vmf->flags & FAULT_FLAG_WRITE) && sync &&
+			(iomap.flags & IOMAP_F_NEEDDSYNC);
 	switch (iomap.type) {
 	case IOMAP_MAPPED:
 		if (iomap.flags & IOMAP_F_NEW) {
 			count_vm_event(PGMAJFAULT);
 			count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
 		}
-		vmf_ret = dax_insert_mapping(vmf, &iomap, pos, entry);
+		vmf_ret = dax_insert_mapping(vmf, &iomap, pos, entry, force_ro);
 		goto finish_iomap;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
@@ -1219,7 +1230,7 @@  static int dax_iomap_pte_fault(struct vm_fault *vmf, bool sync,
 #define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
 
 static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
-		loff_t pos, void *entry)
+		loff_t pos, void *entry, bool force_ro)
 {
 	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
 	const sector_t sector = dax_iomap_sector(iomap, pos);
@@ -1232,6 +1243,7 @@  static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
 	pgoff_t pgoff;
 	pfn_t pfn;
 	int id;
+	int result;
 
 	if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
 		goto fallback;
@@ -1256,8 +1268,19 @@  static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
 		goto fallback;
 
 	trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
-	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
-			pfn, vmf->flags & FAULT_FLAG_WRITE);
+	result = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
+			pfn, (vmf->flags & FAULT_FLAG_WRITE) && !force_ro);
+	/* Did we insert RO PMD despite the fault being a write one? */
+	if (!(result & VM_FAULT_ERROR) && (vmf->flags & FAULT_FLAG_WRITE) &&
+	    force_ro) {
+		result |= VM_FAULT_RO;
+		/*
+		 * Hack: Store PFN here so that we can pass it to
+		 * vmf_insert_pfn_pmd() when changing PMD to RW.
+		 */
+		vmf->orig_pte = pfn_t_pte(pfn, vmf->vma->vm_page_prot);
+	}
+	return result;
 
 unlock_fallback:
 	dax_read_unlock(id);
@@ -1320,6 +1343,7 @@  static int dax_iomap_pmd_fault(struct vm_fault *vmf, bool sync,
 	void *entry;
 	loff_t pos;
 	int error;
+	bool force_ro;
 
 	/*
 	 * Check whether offset isn't beyond end of file now. Caller is
@@ -1385,9 +1409,13 @@  static int dax_iomap_pmd_fault(struct vm_fault *vmf, bool sync,
 	if (iomap.offset + iomap.length < pos + PMD_SIZE)
 		goto finish_iomap;
 
+	force_ro = (vmf->flags & FAULT_FLAG_WRITE) && sync &&
+			(iomap.flags & IOMAP_F_NEEDDSYNC);
+
 	switch (iomap.type) {
 	case IOMAP_MAPPED:
-		result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry);
+		result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry,
+						force_ro);
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index f64dc6ce5161..957463602f6e 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -22,6 +22,8 @@  struct vm_fault;
  * Flags for all iomap mappings:
  */
 #define IOMAP_F_NEW	0x01	/* blocks have been newly allocated */
+#define IOMAP_F_NEEDDSYNC	0x02	/* inode needs fdatasync for storage to
+					 * become persistent */
 
 /*
  * Flags that only need to be reported for IOMAP_REPORT requests:
diff --git a/include/linux/mm.h b/include/linux/mm.h
index fa036093e76c..5085647d9f2f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1142,6 +1142,8 @@  static inline void clear_page_pfmemalloc(struct page *page)
 #define VM_FAULT_RETRY	0x0400	/* ->fault blocked, must retry */
 #define VM_FAULT_FALLBACK 0x0800	/* huge page fault failed, fall back to small */
 #define VM_FAULT_DONE_COW   0x1000	/* ->fault has fully handled COW */
+#define VM_FAULT_RO	0x2000		/* Write fault was handled just by
+					 * inserting RO page table entry for DAX */
 
 #define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \
 			 VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \