diff mbox

[v2,08/11] dax: add support for fsync/sync

Message ID 1447459610-14259-9-git-send-email-ross.zwisler@linux.intel.com
State Not Applicable, archived
Headers show

Commit Message

Ross Zwisler Nov. 14, 2015, 12:06 a.m. UTC
To properly handle fsync/msync in an efficient way DAX needs to track dirty
pages so it is able to flush them durably to media on demand.

The tracking of dirty pages is done via the radix tree in struct
address_space.  This radix tree is already used by the page writeback
infrastructure for tracking dirty pages associated with an open file, and
it already has support for exceptional (non struct page*) entries.  We
build upon these features to add exceptional entries to the radix tree for
DAX dirty PMD or PTE pages at fault time.

When called as part of the msync/fsync flush path DAX queries the radix
tree for dirty entries, flushing them and then marking the PTE or PMD page
table entries as clean.  The step of cleaning the PTE or PMD entries is
necessary so that on subsequent writes to the same page we get a new write
fault allowing us to once again dirty the DAX tag in the radix tree.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
---
 fs/dax.c            | 140 +++++++++++++++++++++++++++++++++++++++++++++++++---
 include/linux/dax.h |   1 +
 mm/huge_memory.c    |  14 +++---
 3 files changed, 141 insertions(+), 14 deletions(-)

Comments

Dave Chinner Nov. 16, 2015, 10:58 p.m. UTC | #1
On Fri, Nov 13, 2015 at 05:06:47PM -0700, Ross Zwisler wrote:
> To properly handle fsync/msync in an efficient way DAX needs to track dirty
> pages so it is able to flush them durably to media on demand.
> 
> The tracking of dirty pages is done via the radix tree in struct
> address_space.  This radix tree is already used by the page writeback
> infrastructure for tracking dirty pages associated with an open file, and
> it already has support for exceptional (non struct page*) entries.  We
> build upon these features to add exceptional entries to the radix tree for
> DAX dirty PMD or PTE pages at fault time.
> 
> When called as part of the msync/fsync flush path DAX queries the radix
> tree for dirty entries, flushing them and then marking the PTE or PMD page
> table entries as clean.  The step of cleaning the PTE or PMD entries is
> necessary so that on subsequent writes to the same page we get a new write
> fault allowing us to once again dirty the DAX tag in the radix tree.
> 
> Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> ---
>  fs/dax.c            | 140 +++++++++++++++++++++++++++++++++++++++++++++++++---
>  include/linux/dax.h |   1 +
>  mm/huge_memory.c    |  14 +++---
>  3 files changed, 141 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/dax.c b/fs/dax.c
> index 131fd35a..9ce6d1b 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -24,7 +24,9 @@
>  #include <linux/memcontrol.h>
>  #include <linux/mm.h>
>  #include <linux/mutex.h>
> +#include <linux/pagevec.h>
>  #include <linux/pmem.h>
> +#include <linux/rmap.h>
>  #include <linux/sched.h>
>  #include <linux/uio.h>
>  #include <linux/vmstat.h>
> @@ -287,6 +289,53 @@ static int copy_user_bh(struct page *to, struct buffer_head *bh,
>  	return 0;
>  }
>  
> +static int dax_dirty_pgoff(struct address_space *mapping, unsigned long pgoff,
> +		void __pmem *addr, bool pmd_entry)
> +{
> +	struct radix_tree_root *page_tree = &mapping->page_tree;
> +	int error = 0;
> +	void *entry;
> +
> +	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
> +
> +	spin_lock_irq(&mapping->tree_lock);
> +	entry = radix_tree_lookup(page_tree, pgoff);
> +	if (addr == NULL) {
> +		if (entry)
> +			goto dirty;
> +		else {
> +			WARN(1, "DAX pfn_mkwrite failed to find an entry");
> +			goto out;
> +		}
> +	}
> +
> +	if (entry) {
> +		if (pmd_entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PTE) {
> +			radix_tree_delete(&mapping->page_tree, pgoff);
> +			mapping->nrdax--;
> +		} else
> +			goto dirty;
> +	}

Logic is pretty spaghettied here. Perhaps:

	entry = radix_tree_lookup(page_tree, pgoff);
	if (entry) {
		if (!pmd_entry || RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD))
			goto dirty;
		radix_tree_delete(&mapping->page_tree, pgoff);
		mapping->nrdax--;
	} else {
		WARN_ON(!addr);
		goto out_unlock;
	}
....

> +
> +	BUG_ON(RADIX_DAX_TYPE(addr));
> +	if (pmd_entry)
> +		error = radix_tree_insert(page_tree, pgoff,
> +				RADIX_DAX_PMD_ENTRY(addr));
> +	else
> +		error = radix_tree_insert(page_tree, pgoff,
> +				RADIX_DAX_PTE_ENTRY(addr));
> +
> +	if (error)
> +		goto out;
> +
> +	mapping->nrdax++;
> + dirty:
> +	radix_tree_tag_set(page_tree, pgoff, PAGECACHE_TAG_DIRTY);
> + out:
> +	spin_unlock_irq(&mapping->tree_lock);

label should be "out_unlock" rather than "out" to indicate in the code
that we are jumping to the correct spot in the error stack...

> +			goto fallback;
>  	}
>  
>   out:
> @@ -689,15 +746,12 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault);
>   * dax_pfn_mkwrite - handle first write to DAX page
>   * @vma: The virtual memory area where the fault occurred
>   * @vmf: The description of the fault
> - *
>   */
>  int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
>  {
> -	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
> +	struct file *file = vma->vm_file;
>  
> -	sb_start_pagefault(sb);
> -	file_update_time(vma->vm_file);
> -	sb_end_pagefault(sb);
> +	dax_dirty_pgoff(file->f_mapping, vmf->pgoff, NULL, false);
>  	return VM_FAULT_NOPAGE;

This seems wrong - it's dropping the freeze protection on fault, and
now the inode timestamp won't get updated, either.

>  }
>  EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
> @@ -772,3 +826,77 @@ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
>  	return dax_zero_page_range(inode, from, length, get_block);
>  }
>  EXPORT_SYMBOL_GPL(dax_truncate_page);
> +
> +static void dax_sync_entry(struct address_space *mapping, pgoff_t pgoff,
> +		void *entry)
> +{

dax_writeback_pgoff() seems like a more consistent name (consider
dax_dirty_pgoff), and we are actually doing a writeback
operation, not a "sync" operation.

> +	struct radix_tree_root *page_tree = &mapping->page_tree;
> +	int type = RADIX_DAX_TYPE(entry);
> +	size_t size;
> +
> +	BUG_ON(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD);
> +
> +	spin_lock_irq(&mapping->tree_lock);
> +	if (!radix_tree_tag_get(page_tree, pgoff, PAGECACHE_TAG_TOWRITE)) {
> +		/* another fsync thread already wrote back this entry */
> +		spin_unlock_irq(&mapping->tree_lock);
> +		return;
> +	}
> +	radix_tree_tag_clear(page_tree, pgoff, PAGECACHE_TAG_TOWRITE);
> +	radix_tree_tag_clear(page_tree, pgoff, PAGECACHE_TAG_DIRTY);
> +	spin_unlock_irq(&mapping->tree_lock);
> +
> +	if (type == RADIX_DAX_PMD)
> +		size = PMD_SIZE;
> +	else
> +		size = PAGE_SIZE;
> +
> +	wb_cache_pmem(RADIX_DAX_ADDR(entry), size);
> +	pgoff_mkclean(pgoff, mapping);

This looks racy w.r.t. another operation setting the radix tree
dirty tags. i.e. there is no locking to serialise marking the
vma/pte clean and another operation marking the radix tree dirty.

> +}
> +
> +/*
> + * Flush the mapping to the persistent domain within the byte range of (start,
> + * end). This is required by data integrity operations to ensure file data is on
> + * persistent storage prior to completion of the operation. It also requires us
> + * to clean the mappings (i.e. write -> RO) so that we'll get a new fault when
> + * the file is written to again so we have an indication that we need to flush
> + * the mapping if a data integrity operation takes place.
> + *
> + * We don't need commits to storage here - the filesystems will issue flushes
> + * appropriately at the conclusion of the data integrity operation via REQ_FUA
> + * writes or blkdev_issue_flush() commands.  This requires the DAX block device
> + * to implement persistent storage domain fencing/commits on receiving a
> + * REQ_FLUSH or REQ_FUA request so that this works as expected by the higher
> + * layers.
> + */
> +void dax_fsync(struct address_space *mapping, loff_t start, loff_t end)
> +{

dax_writeback_mapping_range()

Cheers,

Dave.
Ross Zwisler Nov. 17, 2015, 6:30 p.m. UTC | #2
On Tue, Nov 17, 2015 at 09:58:07AM +1100, Dave Chinner wrote:
> On Fri, Nov 13, 2015 at 05:06:47PM -0700, Ross Zwisler wrote:
> > To properly handle fsync/msync in an efficient way DAX needs to track dirty
> > pages so it is able to flush them durably to media on demand.
> > 
> > The tracking of dirty pages is done via the radix tree in struct
> > address_space.  This radix tree is already used by the page writeback
> > infrastructure for tracking dirty pages associated with an open file, and
> > it already has support for exceptional (non struct page*) entries.  We
> > build upon these features to add exceptional entries to the radix tree for
> > DAX dirty PMD or PTE pages at fault time.
> > 
> > When called as part of the msync/fsync flush path DAX queries the radix
> > tree for dirty entries, flushing them and then marking the PTE or PMD page
> > table entries as clean.  The step of cleaning the PTE or PMD entries is
> > necessary so that on subsequent writes to the same page we get a new write
> > fault allowing us to once again dirty the DAX tag in the radix tree.
> > 
> > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> > ---
> >  fs/dax.c            | 140 +++++++++++++++++++++++++++++++++++++++++++++++++---
> >  include/linux/dax.h |   1 +
> >  mm/huge_memory.c    |  14 +++---
> >  3 files changed, 141 insertions(+), 14 deletions(-)
> > 
> > diff --git a/fs/dax.c b/fs/dax.c
> > index 131fd35a..9ce6d1b 100644
> > --- a/fs/dax.c
> > +++ b/fs/dax.c
> > @@ -24,7 +24,9 @@
> >  #include <linux/memcontrol.h>
> >  #include <linux/mm.h>
> >  #include <linux/mutex.h>
> > +#include <linux/pagevec.h>
> >  #include <linux/pmem.h>
> > +#include <linux/rmap.h>
> >  #include <linux/sched.h>
> >  #include <linux/uio.h>
> >  #include <linux/vmstat.h>
> > @@ -287,6 +289,53 @@ static int copy_user_bh(struct page *to, struct buffer_head *bh,
> >  	return 0;
> >  }
> >  
> > +static int dax_dirty_pgoff(struct address_space *mapping, unsigned long pgoff,
> > +		void __pmem *addr, bool pmd_entry)
> > +{
> > +	struct radix_tree_root *page_tree = &mapping->page_tree;
> > +	int error = 0;
> > +	void *entry;
> > +
> > +	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
> > +
> > +	spin_lock_irq(&mapping->tree_lock);
> > +	entry = radix_tree_lookup(page_tree, pgoff);
> > +	if (addr == NULL) {
> > +		if (entry)
> > +			goto dirty;
> > +		else {
> > +			WARN(1, "DAX pfn_mkwrite failed to find an entry");
> > +			goto out;
> > +		}
> > +	}
> > +
> > +	if (entry) {
> > +		if (pmd_entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PTE) {
> > +			radix_tree_delete(&mapping->page_tree, pgoff);
> > +			mapping->nrdax--;
> > +		} else
> > +			goto dirty;
> > +	}
> 
> Logic is pretty spaghettied here. Perhaps:
> 
> 	entry = radix_tree_lookup(page_tree, pgoff);
> 	if (entry) {
> 		if (!pmd_entry || RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD))
> 			goto dirty;
> 		radix_tree_delete(&mapping->page_tree, pgoff);
> 		mapping->nrdax--;
> 	} else {
> 		WARN_ON(!addr);
> 		goto out_unlock;
> 	}
> ....

I don't think that this works because now if !entry we unconditionally goto
out_unlock without inserting a new entry.  I'll try and simplify the logic and
add some comments.

> > +
> > +	BUG_ON(RADIX_DAX_TYPE(addr));
> > +	if (pmd_entry)
> > +		error = radix_tree_insert(page_tree, pgoff,
> > +				RADIX_DAX_PMD_ENTRY(addr));
> > +	else
> > +		error = radix_tree_insert(page_tree, pgoff,
> > +				RADIX_DAX_PTE_ENTRY(addr));
> > +
> > +	if (error)
> > +		goto out;
> > +
> > +	mapping->nrdax++;
> > + dirty:
> > +	radix_tree_tag_set(page_tree, pgoff, PAGECACHE_TAG_DIRTY);
> > + out:
> > +	spin_unlock_irq(&mapping->tree_lock);
> 
> label should be "out_unlock" rather than "out" to indicate in the code
> that we are jumping to the correct spot in the error stack...

Sure, will do.

> > +			goto fallback;
> >  	}
> >  
> >   out:
> > @@ -689,15 +746,12 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault);
> >   * dax_pfn_mkwrite - handle first write to DAX page
> >   * @vma: The virtual memory area where the fault occurred
> >   * @vmf: The description of the fault
> > - *
> >   */
> >  int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
> >  {
> > -	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
> > +	struct file *file = vma->vm_file;
> >  
> > -	sb_start_pagefault(sb);
> > -	file_update_time(vma->vm_file);
> > -	sb_end_pagefault(sb);
> > +	dax_dirty_pgoff(file->f_mapping, vmf->pgoff, NULL, false);
> >  	return VM_FAULT_NOPAGE;
> 
> This seems wrong - it's dropping the freeze protection on fault, and
> now the inode timestamp won't get updated, either.

Oh, that all still happens in the filesystem pfn_mkwrite code
(xfs_filemap_pfn_mkwrite() for XFS).  It needs to happen there, I think,
because we wanted to order it so that the filesystem freeze happens outside of
the XFS_MMAPLOCK_SHARED locking, as it does with the regular PMD and PTE fault
paths.

Prior to this patch set dax_pfn_mkwrite() was completely unused and was ready
to be removed as dead code - it's now being used by all filesystems just to
make sure we re-add the newly dirtied page to the radix tree dirty list.

> >  }
> >  EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
> > @@ -772,3 +826,77 @@ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
> >  	return dax_zero_page_range(inode, from, length, get_block);
> >  }
> >  EXPORT_SYMBOL_GPL(dax_truncate_page);
> > +
> > +static void dax_sync_entry(struct address_space *mapping, pgoff_t pgoff,
> > +		void *entry)
> > +{
> 
> dax_writeback_pgoff() seems like a more consistent name (consider
> dax_dirty_pgoff), and we are actually doing a writeback
> operation, not a "sync" operation.

Sure, I'm fine with that change.

> > +	struct radix_tree_root *page_tree = &mapping->page_tree;
> > +	int type = RADIX_DAX_TYPE(entry);
> > +	size_t size;
> > +
> > +	BUG_ON(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD);
> > +
> > +	spin_lock_irq(&mapping->tree_lock);
> > +	if (!radix_tree_tag_get(page_tree, pgoff, PAGECACHE_TAG_TOWRITE)) {
> > +		/* another fsync thread already wrote back this entry */
> > +		spin_unlock_irq(&mapping->tree_lock);
> > +		return;
> > +	}
> > +	radix_tree_tag_clear(page_tree, pgoff, PAGECACHE_TAG_TOWRITE);
> > +	radix_tree_tag_clear(page_tree, pgoff, PAGECACHE_TAG_DIRTY);
> > +	spin_unlock_irq(&mapping->tree_lock);
> > +
> > +	if (type == RADIX_DAX_PMD)
> > +		size = PMD_SIZE;
> > +	else
> > +		size = PAGE_SIZE;
> > +
> > +	wb_cache_pmem(RADIX_DAX_ADDR(entry), size);
> > +	pgoff_mkclean(pgoff, mapping);
> 
> This looks racy w.r.t. another operation setting the radix tree
> dirty tags. i.e. there is no locking to serialise marking the
> vma/pte clean and another operation marking the radix tree dirty.

I think you're right - I'll look into how to protect us from this race.  Thank
you for catching this.

> > +}
> > +
> > +/*
> > + * Flush the mapping to the persistent domain within the byte range of (start,
> > + * end). This is required by data integrity operations to ensure file data is on
> > + * persistent storage prior to completion of the operation. It also requires us
> > + * to clean the mappings (i.e. write -> RO) so that we'll get a new fault when
> > + * the file is written to again so we have an indication that we need to flush
> > + * the mapping if a data integrity operation takes place.
> > + *
> > + * We don't need commits to storage here - the filesystems will issue flushes
> > + * appropriately at the conclusion of the data integrity operation via REQ_FUA
> > + * writes or blkdev_issue_flush() commands.  This requires the DAX block device
> > + * to implement persistent storage domain fencing/commits on receiving a
> > + * REQ_FLUSH or REQ_FUA request so that this works as expected by the higher
> > + * layers.
> > + */
> > +void dax_fsync(struct address_space *mapping, loff_t start, loff_t end)
> > +{
> 
> dax_writeback_mapping_range()

Sure, I'm fine with that change.
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/dax.c b/fs/dax.c
index 131fd35a..9ce6d1b 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -24,7 +24,9 @@ 
 #include <linux/memcontrol.h>
 #include <linux/mm.h>
 #include <linux/mutex.h>
+#include <linux/pagevec.h>
 #include <linux/pmem.h>
+#include <linux/rmap.h>
 #include <linux/sched.h>
 #include <linux/uio.h>
 #include <linux/vmstat.h>
@@ -287,6 +289,53 @@  static int copy_user_bh(struct page *to, struct buffer_head *bh,
 	return 0;
 }
 
+static int dax_dirty_pgoff(struct address_space *mapping, unsigned long pgoff,
+		void __pmem *addr, bool pmd_entry)
+{
+	struct radix_tree_root *page_tree = &mapping->page_tree;
+	int error = 0;
+	void *entry;
+
+	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+
+	spin_lock_irq(&mapping->tree_lock);
+	entry = radix_tree_lookup(page_tree, pgoff);
+	if (addr == NULL) {
+		if (entry)
+			goto dirty;
+		else {
+			WARN(1, "DAX pfn_mkwrite failed to find an entry");
+			goto out;
+		}
+	}
+
+	if (entry) {
+		if (pmd_entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PTE) {
+			radix_tree_delete(&mapping->page_tree, pgoff);
+			mapping->nrdax--;
+		} else
+			goto dirty;
+	}
+
+	BUG_ON(RADIX_DAX_TYPE(addr));
+	if (pmd_entry)
+		error = radix_tree_insert(page_tree, pgoff,
+				RADIX_DAX_PMD_ENTRY(addr));
+	else
+		error = radix_tree_insert(page_tree, pgoff,
+				RADIX_DAX_PTE_ENTRY(addr));
+
+	if (error)
+		goto out;
+
+	mapping->nrdax++;
+ dirty:
+	radix_tree_tag_set(page_tree, pgoff, PAGECACHE_TAG_DIRTY);
+ out:
+	spin_unlock_irq(&mapping->tree_lock);
+	return error;
+}
+
 static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 			struct vm_area_struct *vma, struct vm_fault *vmf)
 {
@@ -327,7 +376,10 @@  static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 	}
 
 	error = vm_insert_mixed(vma, vaddr, pfn);
+	if (error)
+		goto out;
 
+	error = dax_dirty_pgoff(mapping, vmf->pgoff, addr, false);
  out:
 	i_mmap_unlock_read(mapping);
 
@@ -450,6 +502,7 @@  int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		delete_from_page_cache(page);
 		unlock_page(page);
 		page_cache_release(page);
+		page = NULL;
 	}
 
 	/*
@@ -537,7 +590,7 @@  int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	pgoff_t size, pgoff;
 	sector_t block, sector;
 	unsigned long pfn;
-	int result = 0;
+	int error, result = 0;
 
 	/* Fall back to PTEs if we're going to COW */
 	if (write && !(vma->vm_flags & VM_SHARED))
@@ -638,6 +691,10 @@  int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		}
 
 		result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write);
+
+		error = dax_dirty_pgoff(mapping, pgoff, kaddr, true);
+		if (error)
+			goto fallback;
 	}
 
  out:
@@ -689,15 +746,12 @@  EXPORT_SYMBOL_GPL(dax_pmd_fault);
  * dax_pfn_mkwrite - handle first write to DAX page
  * @vma: The virtual memory area where the fault occurred
  * @vmf: The description of the fault
- *
  */
 int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+	struct file *file = vma->vm_file;
 
-	sb_start_pagefault(sb);
-	file_update_time(vma->vm_file);
-	sb_end_pagefault(sb);
+	dax_dirty_pgoff(file->f_mapping, vmf->pgoff, NULL, false);
 	return VM_FAULT_NOPAGE;
 }
 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
@@ -772,3 +826,77 @@  int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
 	return dax_zero_page_range(inode, from, length, get_block);
 }
 EXPORT_SYMBOL_GPL(dax_truncate_page);
+
+static void dax_sync_entry(struct address_space *mapping, pgoff_t pgoff,
+		void *entry)
+{
+	struct radix_tree_root *page_tree = &mapping->page_tree;
+	int type = RADIX_DAX_TYPE(entry);
+	size_t size;
+
+	BUG_ON(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD);
+
+	spin_lock_irq(&mapping->tree_lock);
+	if (!radix_tree_tag_get(page_tree, pgoff, PAGECACHE_TAG_TOWRITE)) {
+		/* another fsync thread already wrote back this entry */
+		spin_unlock_irq(&mapping->tree_lock);
+		return;
+	}
+	radix_tree_tag_clear(page_tree, pgoff, PAGECACHE_TAG_TOWRITE);
+	radix_tree_tag_clear(page_tree, pgoff, PAGECACHE_TAG_DIRTY);
+	spin_unlock_irq(&mapping->tree_lock);
+
+	if (type == RADIX_DAX_PMD)
+		size = PMD_SIZE;
+	else
+		size = PAGE_SIZE;
+
+	wb_cache_pmem(RADIX_DAX_ADDR(entry), size);
+	pgoff_mkclean(pgoff, mapping);
+}
+
+/*
+ * Flush the mapping to the persistent domain within the byte range of (start,
+ * end). This is required by data integrity operations to ensure file data is on
+ * persistent storage prior to completion of the operation. It also requires us
+ * to clean the mappings (i.e. write -> RO) so that we'll get a new fault when
+ * the file is written to again so we have an indication that we need to flush
+ * the mapping if a data integrity operation takes place.
+ *
+ * We don't need commits to storage here - the filesystems will issue flushes
+ * appropriately at the conclusion of the data integrity operation via REQ_FUA
+ * writes or blkdev_issue_flush() commands.  This requires the DAX block device
+ * to implement persistent storage domain fencing/commits on receiving a
+ * REQ_FLUSH or REQ_FUA request so that this works as expected by the higher
+ * layers.
+ */
+void dax_fsync(struct address_space *mapping, loff_t start, loff_t end)
+{
+	struct inode *inode = mapping->host;
+	pgoff_t indices[PAGEVEC_SIZE];
+	struct pagevec pvec;
+	int i;
+
+	pgoff_t start_page = start >> PAGE_CACHE_SHIFT;
+	pgoff_t end_page = end >> PAGE_CACHE_SHIFT;
+
+	if (mapping->nrdax == 0)
+		return;
+
+	BUG_ON(inode->i_blkbits != PAGE_SHIFT);
+
+	tag_pages_for_writeback(mapping, start_page, end_page);
+
+	pagevec_init(&pvec, 0);
+	while (1) {
+		pvec.nr = find_get_entries_tag(mapping, start_page,
+				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
+				pvec.pages, indices);
+
+		if (pvec.nr == 0)
+			break;
+
+		for (i = 0; i < pvec.nr; i++)
+			dax_sync_entry(mapping, indices[i], pvec.pages[i]);
+	}
+}
diff --git a/include/linux/dax.h b/include/linux/dax.h
index e9d57f68..2b3ce6f 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -41,4 +41,5 @@  static inline bool dax_mapping(struct address_space *mapping)
 {
 	return mapping->host && IS_DAX(mapping->host);
 }
+void dax_fsync(struct address_space *mapping, loff_t start, loff_t end);
 #endif
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index bbac913..1b3df56 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -877,15 +877,13 @@  static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 	spinlock_t *ptl;
 
 	ptl = pmd_lock(mm, pmd);
-	if (pmd_none(*pmd)) {
-		entry = pmd_mkhuge(pfn_pmd(pfn, prot));
-		if (write) {
-			entry = pmd_mkyoung(pmd_mkdirty(entry));
-			entry = maybe_pmd_mkwrite(entry, vma);
-		}
-		set_pmd_at(mm, addr, pmd, entry);
-		update_mmu_cache_pmd(vma, addr, pmd);
+	entry = pmd_mkhuge(pfn_pmd(pfn, prot));
+	if (write) {
+		entry = pmd_mkyoung(pmd_mkdirty(entry));
+		entry = maybe_pmd_mkwrite(entry, vma);
 	}
+	set_pmd_at(mm, addr, pmd, entry);
+	update_mmu_cache_pmd(vma, addr, pmd);
 	spin_unlock(ptl);
 }