diff mbox

+ ext4-add-dax-functionality.patch added to -mm tree

Message ID 20150220221551.GB2780@wil.cx
State Not Applicable, archived
Headers show

Commit Message

Matthew Wilcox Feb. 20, 2015, 10:15 p.m. UTC
> So to handle this, it can start a transaction in ext4_dax_fault() /
> ext4_dax_mkwrite() if a write is requested, and call ext4_jbd2_file_inode()
> after dax_fault() / dax_mkwrite() returns. The complete function will look
> something like the following:

How about this?  I tried to encompass both the unwritten extent conversion
as well as starting the journal at the right point in the locking hierarchy.

If we're going to expose do_dax_fault(), I think it needs to be renamed
to __dax_fault().

I decided to return VM_FAULT_RETRY and a new flag VM_FAULT_UNWRITTEN from
__dax_fault(), rather than convert it to return an errno.

P.S. I love patches which touch *both* fs.h *and* mm.h.  In case there
were any files that weren't already being rebuilt.

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Jan Kara Feb. 23, 2015, 12:52 p.m. UTC | #1
On Fri 20-02-15 17:15:51, Matthew Wilcox wrote:
> > So to handle this, it can start a transaction in ext4_dax_fault() /
> > ext4_dax_mkwrite() if a write is requested, and call ext4_jbd2_file_inode()
> > after dax_fault() / dax_mkwrite() returns. The complete function will look
> > something like the following:
> 
> How about this?  I tried to encompass both the unwritten extent conversion
> as well as starting the journal at the right point in the locking hierarchy.
> 
> If we're going to expose do_dax_fault(), I think it needs to be called
> __dax_fault().
> 
> I decided to return VM_FAULT_RETRY and a new flag VM_FAULT_UNWRITTEN from
> __dax_fault(), rather than convert it to return an errno.
  I don't like using VM_FAULT_RETRY for ENOSPC. Different filesystems may
want to handle this condition differently. In particular, if a filesystem
decides to use dax_fault(), VM_FAULT_RETRY will get propagated up into mm
code, which just retries the fault (or gets confused if
FAULT_FLAG_ALLOW_RETRY wasn't set).

If you want to stay with VM_FAULT_XXX return values (which makes some sense),
then I guess you need something like VM_FAULT_ENOSPC, which dax_fault() would
then convert to VM_FAULT_SIGBUS.

Otherwise the patch looks good.

								Honza

> P.S. I love patches which touch *both* fs.h *and* mm.h.  In case there
> were any files that weren't already being rebuilt.
> 
> diff --git a/fs/dax.c b/fs/dax.c
> index 556238f..81dbdaa 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -316,7 +316,7 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
>  	return error;
>  }
>  
> -static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
> +int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  			get_block_t get_block)
>  {
>  	struct file *file = vma->vm_file;
> @@ -329,7 +329,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  	sector_t block;
>  	pgoff_t size;
>  	int error;
> -	int major = 0;
> +	int ret = 0;
>  
>  	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
>  	if (vmf->pgoff >= size)
> @@ -367,13 +367,15 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  		error = -EIO;		/* fs corruption? */
>  	if (error)
>  		goto unlock_page;
> +	if (buffer_unwritten(&bh))
> +		ret |= VM_FAULT_UNWRITTEN;
>  
>  	if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
>  		if (vmf->flags & FAULT_FLAG_WRITE) {
>  			error = get_block(inode, block, &bh, 1);
>  			count_vm_event(PGMAJFAULT);
>  			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
> -			major = VM_FAULT_MAJOR;
> +			ret = VM_FAULT_MAJOR;
>  			if (!error && (bh.b_size < PAGE_SIZE))
>  				error = -EIO;
>  			if (error)
> @@ -407,7 +409,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  	}
>  
>  	/* Check we didn't race with a read fault installing a new page */
> -	if (!page && major)
> +	if (!page && (ret & VM_FAULT_MAJOR))
>  		page = find_lock_page(mapping, vmf->pgoff);
>  
>  	if (page) {
> @@ -421,12 +423,14 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  	error = dax_insert_mapping(inode, &bh, vma, vmf);
>  
>   out:
> +	if (error == -ENOSPC)
> +		return VM_FAULT_RETRY | ret;
>  	if (error == -ENOMEM)
> -		return VM_FAULT_OOM | major;
> +		return VM_FAULT_OOM | ret;
>  	/* -EBUSY is fine, somebody else faulted on the same PTE */
>  	if ((error < 0) && (error != -EBUSY))
> -		return VM_FAULT_SIGBUS | major;
> -	return VM_FAULT_NOPAGE | major;
> +		return VM_FAULT_SIGBUS | ret;
> +	return VM_FAULT_NOPAGE | ret;
>  
>   unlock_page:
>  	if (page) {
> @@ -435,6 +439,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  	}
>  	goto out;
>  }
> +EXPORT_SYMBOL_GPL(__dax_fault);
>  
>  /**
>   * dax_fault - handle a page fault on a DAX file
> @@ -455,7 +460,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  		sb_start_pagefault(sb);
>  		file_update_time(vma->vm_file);
>  	}
> -	result = do_dax_fault(vma, vmf, get_block);
> +	result = __dax_fault(vma, vmf, get_block);
>  	if (vmf->flags & FAULT_FLAG_WRITE)
>  		sb_end_pagefault(sb);
>  
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 4340e38..84b4f1c 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -194,7 +194,58 @@ errout:
>  #ifdef CONFIG_FS_DAX
>  static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
>  {
> -	return dax_fault(vma, vmf, ext4_get_block_write);
> +	handle_t *handle;
> +	int create = (vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page;
> +	struct inode *inode = file_inode(vma->vm_file);
> +	int ret, err = 0;
> +	int retries = 0;
> +
> +	if (create) {
> +		sb_start_pagefault(inode->i_sb);
> +		file_update_time(vma->vm_file);
> + retry_alloc:
> +		handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
> +					ext4_writepage_trans_blocks(inode));
> +		if (IS_ERR(handle)) {
> +			err = PTR_ERR(handle);
> +			goto err;
> +		}
> +	}
> +
> +	ret = __dax_fault(vma, vmf, ext4_get_block);
> +
> +	if (create) {
> +		if (ret & VM_FAULT_UNWRITTEN) {
> +			loff_t offset = (loff_t)vmf->pgoff << PAGE_SHIFT;
> +			err = ext4_convert_unwritten_extents(NULL, inode,
> +							offset, PAGE_SIZE);
> +			ret &= ~VM_FAULT_UNWRITTEN;
> +		}
> +		if (!err &&
> +		    ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
> +			err = ext4_jbd2_file_inode(handle, inode);
> +
> +		if (err == -ENOSPC) {
> +			ret |= VM_FAULT_RETRY;
> +			err = 0;
> +		}
> +
> +		ext4_journal_stop(handle);
> +		if (err < 0)
> +			goto err;
> +		if ((ret & VM_FAULT_RETRY) &&
> +		    ext4_should_retry_alloc(inode->i_sb, &retries))
> +			goto retry_alloc;
> +		ret &= ~VM_FAULT_RETRY;
> +	}
> +
> + out:
> +	if (create)
> +		sb_end_pagefault(inode->i_sb);
> +	return ret;
> + err:
> +	ret = block_page_mkwrite_return(err);
> +	goto out;
>  }
>  
>  static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 85404f1..8f1ea7d 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -657,18 +657,6 @@ has_zeroout:
>  	return retval;
>  }
>  
> -static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
> -{
> -	struct inode *inode = bh->b_assoc_map->host;
> -	/* XXX: breaks on 32-bit > 16GB. Is that even supported? */
> -	loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
> -	int err;
> -	if (!uptodate)
> -		return;
> -	WARN_ON(!buffer_unwritten(bh));
> -	err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
> -}
> -
>  /* Maximum number of blocks we map for direct IO at once. */
>  #define DIO_MAX_BLOCKS 4096
>  
> @@ -706,11 +694,6 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
>  
>  		map_bh(bh, inode->i_sb, map.m_pblk);
>  		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
> -		if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
> -			bh->b_assoc_map = inode->i_mapping;
> -			bh->b_private = (void *)(unsigned long)iblock;
> -			bh->b_end_io = ext4_end_io_unwritten;
> -		}
>  		if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
>  			set_buffer_defer_completion(bh);
>  		bh->b_size = inode->i_sb->s_blocksize * map.m_len;
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 239c89c..2af5050 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -2597,6 +2597,7 @@ int dax_clear_blocks(struct inode *, sector_t block, long size);
>  int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
>  int dax_truncate_page(struct inode *, loff_t from, get_block_t);
>  int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
> +int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
>  int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
>  					unsigned int flags, get_block_t);
>  #define dax_mkwrite(vma, vmf, gb)	dax_fault(vma, vmf, gb)
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index ceb50ec..ffc9947 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1100,7 +1100,7 @@ static inline int page_mapped(struct page *page)
>  #define VM_FAULT_HWPOISON 0x0010	/* Hit poisoned small page */
>  #define VM_FAULT_HWPOISON_LARGE 0x0020  /* Hit poisoned large page. Index encoded in upper bits */
>  #define VM_FAULT_SIGSEGV 0x0040
> -
> +#define VM_FAULT_UNWRITTEN 0x0080	/* Unwritten extent needs conversion */
>  #define VM_FAULT_NOPAGE	0x0100	/* ->fault installed the pte, not return page */
>  #define VM_FAULT_LOCKED	0x0200	/* ->fault locked the returned page */
>  #define VM_FAULT_RETRY	0x0400	/* ->fault blocked, must retry */
diff mbox

Patch

diff --git a/fs/dax.c b/fs/dax.c
index 556238f..81dbdaa 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -316,7 +316,7 @@  static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 	return error;
 }
 
-static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 			get_block_t get_block)
 {
 	struct file *file = vma->vm_file;
@@ -329,7 +329,7 @@  static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	sector_t block;
 	pgoff_t size;
 	int error;
-	int major = 0;
+	int ret = 0;
 
 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	if (vmf->pgoff >= size)
@@ -367,13 +367,15 @@  static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		error = -EIO;		/* fs corruption? */
 	if (error)
 		goto unlock_page;
+	if (buffer_unwritten(&bh))
+		ret |= VM_FAULT_UNWRITTEN;
 
 	if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
 		if (vmf->flags & FAULT_FLAG_WRITE) {
 			error = get_block(inode, block, &bh, 1);
 			count_vm_event(PGMAJFAULT);
 			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
-			major = VM_FAULT_MAJOR;
+			ret = VM_FAULT_MAJOR;
 			if (!error && (bh.b_size < PAGE_SIZE))
 				error = -EIO;
 			if (error)
@@ -407,7 +409,7 @@  static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	}
 
 	/* Check we didn't race with a read fault installing a new page */
-	if (!page && major)
+	if (!page && (ret & VM_FAULT_MAJOR))
 		page = find_lock_page(mapping, vmf->pgoff);
 
 	if (page) {
@@ -421,12 +423,14 @@  static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	error = dax_insert_mapping(inode, &bh, vma, vmf);
 
  out:
+	if (error == -ENOSPC)
+		return VM_FAULT_RETRY | ret;
 	if (error == -ENOMEM)
-		return VM_FAULT_OOM | major;
+		return VM_FAULT_OOM | ret;
 	/* -EBUSY is fine, somebody else faulted on the same PTE */
 	if ((error < 0) && (error != -EBUSY))
-		return VM_FAULT_SIGBUS | major;
-	return VM_FAULT_NOPAGE | major;
+		return VM_FAULT_SIGBUS | ret;
+	return VM_FAULT_NOPAGE | ret;
 
  unlock_page:
 	if (page) {
@@ -435,6 +439,7 @@  static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	}
 	goto out;
 }
+EXPORT_SYMBOL_GPL(__dax_fault);
 
 /**
  * dax_fault - handle a page fault on a DAX file
@@ -455,7 +460,7 @@  int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		sb_start_pagefault(sb);
 		file_update_time(vma->vm_file);
 	}
-	result = do_dax_fault(vma, vmf, get_block);
+	result = __dax_fault(vma, vmf, get_block);
 	if (vmf->flags & FAULT_FLAG_WRITE)
 		sb_end_pagefault(sb);
 
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 4340e38..84b4f1c 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -194,7 +194,58 @@  errout:
 #ifdef CONFIG_FS_DAX
 static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	return dax_fault(vma, vmf, ext4_get_block_write);
+	handle_t *handle;
+	int create = (vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page;
+	struct inode *inode = file_inode(vma->vm_file);
+	int ret, err = 0;
+	int retries = 0;
+
+	if (create) {
+		sb_start_pagefault(inode->i_sb);
+		file_update_time(vma->vm_file);
+ retry_alloc:
+		handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+					ext4_writepage_trans_blocks(inode));
+		if (IS_ERR(handle)) {
+			err = PTR_ERR(handle);
+			goto err;
+		}
+	}
+
+	ret = __dax_fault(vma, vmf, ext4_get_block);
+
+	if (create) {
+		if (ret & VM_FAULT_UNWRITTEN) {
+			loff_t offset = (loff_t)vmf->pgoff << PAGE_SHIFT;
+			err = ext4_convert_unwritten_extents(NULL, inode,
+							offset, PAGE_SIZE);
+			ret &= ~VM_FAULT_UNWRITTEN;
+		}
+		if (!err &&
+		    ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
+			err = ext4_jbd2_file_inode(handle, inode);
+
+		if (err == -ENOSPC) {
+			ret |= VM_FAULT_RETRY;
+			err = 0;
+		}
+
+		ext4_journal_stop(handle);
+		if (err < 0)
+			goto err;
+		if ((ret & VM_FAULT_RETRY) &&
+		    ext4_should_retry_alloc(inode->i_sb, &retries))
+			goto retry_alloc;
+		ret &= ~VM_FAULT_RETRY;
+	}
+
+ out:
+	if (create)
+		sb_end_pagefault(inode->i_sb);
+	return ret;
+ err:
+	ret = block_page_mkwrite_return(err);
+	goto out;
 }
 
 static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 85404f1..8f1ea7d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -657,18 +657,6 @@  has_zeroout:
 	return retval;
 }
 
-static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
-{
-	struct inode *inode = bh->b_assoc_map->host;
-	/* XXX: breaks on 32-bit > 16GB. Is that even supported? */
-	loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
-	int err;
-	if (!uptodate)
-		return;
-	WARN_ON(!buffer_unwritten(bh));
-	err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
-}
-
 /* Maximum number of blocks we map for direct IO at once. */
 #define DIO_MAX_BLOCKS 4096
 
@@ -706,11 +694,6 @@  static int _ext4_get_block(struct inode *inode, sector_t iblock,
 
 		map_bh(bh, inode->i_sb, map.m_pblk);
 		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
-		if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
-			bh->b_assoc_map = inode->i_mapping;
-			bh->b_private = (void *)(unsigned long)iblock;
-			bh->b_end_io = ext4_end_io_unwritten;
-		}
 		if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
 			set_buffer_defer_completion(bh);
 		bh->b_size = inode->i_sb->s_blocksize * map.m_len;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 239c89c..2af5050 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2597,6 +2597,7 @@  int dax_clear_blocks(struct inode *, sector_t block, long size);
 int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
 int dax_truncate_page(struct inode *, loff_t from, get_block_t);
 int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
+int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
 int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
 					unsigned int flags, get_block_t);
 #define dax_mkwrite(vma, vmf, gb)	dax_fault(vma, vmf, gb)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ceb50ec..ffc9947 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1100,7 +1100,7 @@  static inline int page_mapped(struct page *page)
 #define VM_FAULT_HWPOISON 0x0010	/* Hit poisoned small page */
 #define VM_FAULT_HWPOISON_LARGE 0x0020  /* Hit poisoned large page. Index encoded in upper bits */
 #define VM_FAULT_SIGSEGV 0x0040
-
+#define VM_FAULT_UNWRITTEN 0x0080	/* Unwritten extent needs conversion */
 #define VM_FAULT_NOPAGE	0x0100	/* ->fault installed the pte, not return page */
 #define VM_FAULT_LOCKED	0x0200	/* ->fault locked the returned page */
 #define VM_FAULT_RETRY	0x0400	/* ->fault blocked, must retry */