diff mbox

mm: replace FAULT_FLAG_SIZE with parameter to huge_fault

Message ID 148615748258.43180.1690152053774975329.stgit@djiang5-desk3.ch.intel.com
State Not Applicable
Headers show

Commit Message

Dave Jiang Feb. 3, 2017, 9:31 p.m. UTC
Since the introduction of FAULT_FLAG_SIZE to the vm_fault flag, it has
been somewhat painful with getting the flags set and removed at the
correct locations. More than one kernel oops was introduced due to
difficulties of getting the placement correctly. Removing the flag
values and introducing an input parameter to huge_fault that indicates
the size of the page entry. This makes the code easier to trace and
should avoid the issues we see with the fault flags where removal of the
flag was necessary in the fallback paths.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/dax/dax.c   |   18 ++++++++++++------
 fs/dax.c            |    9 +++++----
 fs/ext2/file.c      |    2 +-
 fs/ext4/file.c      |   12 +++++++++---
 fs/xfs/xfs_file.c   |    9 +++++----
 include/linux/dax.h |    3 ++-
 include/linux/mm.h  |   14 ++++++++------
 mm/memory.c         |   17 ++++-------------
 8 files changed, 46 insertions(+), 38 deletions(-)

Comments

Dan Williams Feb. 3, 2017, 9:47 p.m. UTC | #1
On Fri, Feb 3, 2017 at 1:31 PM, Dave Jiang <dave.jiang@intel.com> wrote:
> Since the introduction of FAULT_FLAG_SIZE to the vm_fault flag, it has
> been somewhat painful with getting the flags set and removed at the
> correct locations. More than one kernel oops was introduced due to
> difficulties of getting the placement correctly. Removing the flag
> values and introducing an input parameter to huge_fault that indicates
> the size of the page entry. This makes the code easier to trace and
> should avoid the issues we see with the fault flags where removal of the
> flag was necessary in the fallback paths.
>
> Signed-off-by: Dave Jiang <dave.jiang@intel.com>

Tested-by: Dan Williams <dan.j.williams@intel.com>

This fixes a crash I can produce with the existing ndctl unit tests
[1] on next-20170202.  Now to go extend the tests to go after the PUD
case...

[1]: https://github.com/pmem/ndctl
kernel test robot Feb. 3, 2017, 10:56 p.m. UTC | #2
Hi Dave,

[auto build test ERROR on mmotm/master]
[cannot apply to linus/master linux/master v4.10-rc6 next-20170203]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Dave-Jiang/mm-replace-FAULT_FLAG_SIZE-with-parameter-to-huge_fault/20170204-053548
base:   git://git.cmpxchg.org/linux-mmotm.git master
config: i386-randconfig-x004-201705 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All errors (new ones prefixed by >>):

>> fs/ext4/file.c:280:1: error: conflicting types for 'ext4_dax_huge_fault'
    ext4_dax_huge_fault(struct vm_fault *vmf)
    ^~~~~~~~~~~~~~~~~~~
   fs/ext4/file.c:258:12: note: previous definition of 'ext4_dax_huge_fault' was here
    static int ext4_dax_huge_fault(struct vm_fault *vmf,
               ^~~~~~~~~~~~~~~~~~~
   fs/ext4/file.c: In function 'ext4_dax_huge_fault':
>> fs/ext4/file.c:292:32: error: incompatible type for argument 2 of 'dax_iomap_fault'
     result = dax_iomap_fault(vmf, &ext4_iomap_ops);
                                   ^
   In file included from fs/ext4/file.c:25:0:
   include/linux/dax.h:41:5: note: expected 'enum page_entry_size' but argument is of type 'struct iomap_ops *'
    int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
        ^~~~~~~~~~~~~~~
>> fs/ext4/file.c:292:11: error: too few arguments to function 'dax_iomap_fault'
     result = dax_iomap_fault(vmf, &ext4_iomap_ops);
              ^~~~~~~~~~~~~~~
   In file included from fs/ext4/file.c:25:0:
   include/linux/dax.h:41:5: note: declared here
    int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
        ^~~~~~~~~~~~~~~
   fs/ext4/file.c: In function 'ext4_dax_fault':
>> fs/ext4/file.c:302:9: error: too many arguments to function 'ext4_dax_huge_fault'
     return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
            ^~~~~~~~~~~~~~~~~~~
   fs/ext4/file.c:280:1: note: declared here
    ext4_dax_huge_fault(struct vm_fault *vmf)
    ^~~~~~~~~~~~~~~~~~~
   fs/ext4/file.c: At top level:
>> fs/ext4/file.c:337:16: error: initialization from incompatible pointer type [-Werror=incompatible-pointer-types]
     .huge_fault = ext4_dax_huge_fault,
                   ^~~~~~~~~~~~~~~~~~~
   fs/ext4/file.c:337:16: note: (near initialization for 'ext4_dax_vm_ops.huge_fault')
   fs/ext4/file.c:258:12: warning: 'ext4_dax_huge_fault' defined but not used [-Wunused-function]
    static int ext4_dax_huge_fault(struct vm_fault *vmf,
               ^~~~~~~~~~~~~~~~~~~
   cc1: some warnings being treated as errors

vim +/ext4_dax_huge_fault +280 fs/ext4/file.c

01a33b4ac Matthew Wilcox 2015-09-08  274  		sb_end_pagefault(sb);
01a33b4ac Matthew Wilcox 2015-09-08  275  
01a33b4ac Matthew Wilcox 2015-09-08  276  	return result;
923ae0ff9 Ross Zwisler   2015-02-16  277  }
923ae0ff9 Ross Zwisler   2015-02-16  278  
c6da0697e Dave Jiang     2017-02-02  279  static int
30599588c Dave Jiang     2017-02-02 @280  ext4_dax_huge_fault(struct vm_fault *vmf)
11bd1a9ec Matthew Wilcox 2015-09-08  281  {
01a33b4ac Matthew Wilcox 2015-09-08  282  	int result;
e6ae40ec2 Dave Jiang     2017-02-02  283  	struct inode *inode = file_inode(vmf->vma->vm_file);
01a33b4ac Matthew Wilcox 2015-09-08  284  	struct super_block *sb = inode->i_sb;
c6da0697e Dave Jiang     2017-02-02  285  	bool write = vmf->flags & FAULT_FLAG_WRITE;
01a33b4ac Matthew Wilcox 2015-09-08  286  
01a33b4ac Matthew Wilcox 2015-09-08  287  	if (write) {
01a33b4ac Matthew Wilcox 2015-09-08  288  		sb_start_pagefault(sb);
e6ae40ec2 Dave Jiang     2017-02-02  289  		file_update_time(vmf->vma->vm_file);
1db175428 Jan Kara       2016-10-21  290  	}
ea3d7209c Jan Kara       2015-12-07  291  	down_read(&EXT4_I(inode)->i_mmap_sem);
30599588c Dave Jiang     2017-02-02 @292  	result = dax_iomap_fault(vmf, &ext4_iomap_ops);
ea3d7209c Jan Kara       2015-12-07  293  	up_read(&EXT4_I(inode)->i_mmap_sem);
1db175428 Jan Kara       2016-10-21  294  	if (write)
01a33b4ac Matthew Wilcox 2015-09-08  295  		sb_end_pagefault(sb);
01a33b4ac Matthew Wilcox 2015-09-08  296  
01a33b4ac Matthew Wilcox 2015-09-08  297  	return result;
11bd1a9ec Matthew Wilcox 2015-09-08  298  }
11bd1a9ec Matthew Wilcox 2015-09-08  299  
22711acc4 Dave Jiang     2017-02-03  300  static int ext4_dax_fault(struct vm_fault *vmf)
22711acc4 Dave Jiang     2017-02-03  301  {
22711acc4 Dave Jiang     2017-02-03 @302  	return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
22711acc4 Dave Jiang     2017-02-03  303  }
22711acc4 Dave Jiang     2017-02-03  304  
ea3d7209c Jan Kara       2015-12-07  305  /*
1e9d180ba Ross Zwisler   2016-02-27  306   * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault()
ea3d7209c Jan Kara       2015-12-07  307   * handler we check for races agaist truncate. Note that since we cycle through
ea3d7209c Jan Kara       2015-12-07  308   * i_mmap_sem, we are sure that also any hole punching that began before we
ea3d7209c Jan Kara       2015-12-07  309   * were called is finished by now and so if it included part of the file we
ea3d7209c Jan Kara       2015-12-07  310   * are working on, our pte will get unmapped and the check for pte_same() in
ea3d7209c Jan Kara       2015-12-07  311   * wp_pfn_shared() fails. Thus fault gets retried and things work out as
ea3d7209c Jan Kara       2015-12-07  312   * desired.
ea3d7209c Jan Kara       2015-12-07  313   */
1ebf3e0da Dave Jiang     2017-02-02  314  static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf)
ea3d7209c Jan Kara       2015-12-07  315  {
1ebf3e0da Dave Jiang     2017-02-02  316  	struct inode *inode = file_inode(vmf->vma->vm_file);
ea3d7209c Jan Kara       2015-12-07  317  	struct super_block *sb = inode->i_sb;
ea3d7209c Jan Kara       2015-12-07  318  	loff_t size;
d5be7a03b Ross Zwisler   2016-01-22  319  	int ret;
ea3d7209c Jan Kara       2015-12-07  320  
ea3d7209c Jan Kara       2015-12-07  321  	sb_start_pagefault(sb);
1ebf3e0da Dave Jiang     2017-02-02  322  	file_update_time(vmf->vma->vm_file);
ea3d7209c Jan Kara       2015-12-07  323  	down_read(&EXT4_I(inode)->i_mmap_sem);
ea3d7209c Jan Kara       2015-12-07  324  	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
ea3d7209c Jan Kara       2015-12-07  325  	if (vmf->pgoff >= size)
ea3d7209c Jan Kara       2015-12-07  326  		ret = VM_FAULT_SIGBUS;
d5be7a03b Ross Zwisler   2016-01-22  327  	else
1ebf3e0da Dave Jiang     2017-02-02  328  		ret = dax_pfn_mkwrite(vmf);
ea3d7209c Jan Kara       2015-12-07  329  	up_read(&EXT4_I(inode)->i_mmap_sem);
ea3d7209c Jan Kara       2015-12-07  330  	sb_end_pagefault(sb);
ea3d7209c Jan Kara       2015-12-07  331  
ea3d7209c Jan Kara       2015-12-07  332  	return ret;
923ae0ff9 Ross Zwisler   2015-02-16  333  }
923ae0ff9 Ross Zwisler   2015-02-16  334  
923ae0ff9 Ross Zwisler   2015-02-16  335  static const struct vm_operations_struct ext4_dax_vm_ops = {
923ae0ff9 Ross Zwisler   2015-02-16  336  	.fault		= ext4_dax_fault,
22711acc4 Dave Jiang     2017-02-03 @337  	.huge_fault	= ext4_dax_huge_fault,
1e9d180ba Ross Zwisler   2016-02-27  338  	.page_mkwrite	= ext4_dax_fault,
ea3d7209c Jan Kara       2015-12-07  339  	.pfn_mkwrite	= ext4_dax_pfn_mkwrite,
923ae0ff9 Ross Zwisler   2015-02-16  340  };

:::::: The code at line 280 was first introduced by commit
:::::: 30599588c9eaccc211d383c9974a3a88dfa6e7d5 mm,fs,dax: change ->pmd_fault to ->huge_fault

:::::: TO: Dave Jiang <dave.jiang@intel.com>
:::::: CC: Johannes Weiner <hannes@cmpxchg.org>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Dave Jiang Feb. 3, 2017, 11:25 p.m. UTC | #3
On 02/03/2017 03:56 PM, kbuild test robot wrote:
> Hi Dave,
> 
> [auto build test ERROR on mmotm/master]
> [cannot apply to linus/master linux/master v4.10-rc6 next-20170203]
> [if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

This one is a bit odd. I just pulled mmotm tree master branch and built
with the attached .config and it passed for me (and I don't see this
commit in the master branch). I also built linux-next with this patch on
top and it also passes with attached .config. Looking at the err log
below it seems the code has a mix of partial from before and after the
patch. I'm rather confused about it....


> 
> url:    https://github.com/0day-ci/linux/commits/Dave-Jiang/mm-replace-FAULT_FLAG_SIZE-with-parameter-to-huge_fault/20170204-053548
> base:   git://git.cmpxchg.org/linux-mmotm.git master
> config: i386-randconfig-x004-201705 (attached as .config)
> compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
> reproduce:
>         # save the attached .config to linux build tree
>         make ARCH=i386 
> 
> All errors (new ones prefixed by >>):
> 
>>> fs/ext4/file.c:280:1: error: conflicting types for 'ext4_dax_huge_fault'
>     ext4_dax_huge_fault(struct vm_fault *vmf)
>     ^~~~~~~~~~~~~~~~~~~
>    fs/ext4/file.c:258:12: note: previous definition of 'ext4_dax_huge_fault' was here
>     static int ext4_dax_huge_fault(struct vm_fault *vmf,
>                ^~~~~~~~~~~~~~~~~~~
>    fs/ext4/file.c: In function 'ext4_dax_huge_fault':
>>> fs/ext4/file.c:292:32: error: incompatible type for argument 2 of 'dax_iomap_fault'
>      result = dax_iomap_fault(vmf, &ext4_iomap_ops);
>                                    ^
>    In file included from fs/ext4/file.c:25:0:
>    include/linux/dax.h:41:5: note: expected 'enum page_entry_size' but argument is of type 'struct iomap_ops *'
>     int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
>         ^~~~~~~~~~~~~~~
>>> fs/ext4/file.c:292:11: error: too few arguments to function 'dax_iomap_fault'
>      result = dax_iomap_fault(vmf, &ext4_iomap_ops);
>               ^~~~~~~~~~~~~~~
>    In file included from fs/ext4/file.c:25:0:
>    include/linux/dax.h:41:5: note: declared here
>     int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
>         ^~~~~~~~~~~~~~~
>    fs/ext4/file.c: In function 'ext4_dax_fault':
>>> fs/ext4/file.c:302:9: error: too many arguments to function 'ext4_dax_huge_fault'
>      return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
>             ^~~~~~~~~~~~~~~~~~~
>    fs/ext4/file.c:280:1: note: declared here
>     ext4_dax_huge_fault(struct vm_fault *vmf)
>     ^~~~~~~~~~~~~~~~~~~
>    fs/ext4/file.c: At top level:
>>> fs/ext4/file.c:337:16: error: initialization from incompatible pointer type [-Werror=incompatible-pointer-types]
>      .huge_fault = ext4_dax_huge_fault,
>                    ^~~~~~~~~~~~~~~~~~~
>    fs/ext4/file.c:337:16: note: (near initialization for 'ext4_dax_vm_ops.huge_fault')
>    fs/ext4/file.c:258:12: warning: 'ext4_dax_huge_fault' defined but not used [-Wunused-function]
>     static int ext4_dax_huge_fault(struct vm_fault *vmf,
>                ^~~~~~~~~~~~~~~~~~~
>    cc1: some warnings being treated as errors
> 
> vim +/ext4_dax_huge_fault +280 fs/ext4/file.c
> 
> 01a33b4ac Matthew Wilcox 2015-09-08  274  		sb_end_pagefault(sb);
> 01a33b4ac Matthew Wilcox 2015-09-08  275  
> 01a33b4ac Matthew Wilcox 2015-09-08  276  	return result;
> 923ae0ff9 Ross Zwisler   2015-02-16  277  }
> 923ae0ff9 Ross Zwisler   2015-02-16  278  
> c6da0697e Dave Jiang     2017-02-02  279  static int
> 30599588c Dave Jiang     2017-02-02 @280  ext4_dax_huge_fault(struct vm_fault *vmf)
> 11bd1a9ec Matthew Wilcox 2015-09-08  281  {
> 01a33b4ac Matthew Wilcox 2015-09-08  282  	int result;
> e6ae40ec2 Dave Jiang     2017-02-02  283  	struct inode *inode = file_inode(vmf->vma->vm_file);
> 01a33b4ac Matthew Wilcox 2015-09-08  284  	struct super_block *sb = inode->i_sb;
> c6da0697e Dave Jiang     2017-02-02  285  	bool write = vmf->flags & FAULT_FLAG_WRITE;
> 01a33b4ac Matthew Wilcox 2015-09-08  286  
> 01a33b4ac Matthew Wilcox 2015-09-08  287  	if (write) {
> 01a33b4ac Matthew Wilcox 2015-09-08  288  		sb_start_pagefault(sb);
> e6ae40ec2 Dave Jiang     2017-02-02  289  		file_update_time(vmf->vma->vm_file);
> 1db175428 Jan Kara       2016-10-21  290  	}
> ea3d7209c Jan Kara       2015-12-07  291  	down_read(&EXT4_I(inode)->i_mmap_sem);
> 30599588c Dave Jiang     2017-02-02 @292  	result = dax_iomap_fault(vmf, &ext4_iomap_ops);
> ea3d7209c Jan Kara       2015-12-07  293  	up_read(&EXT4_I(inode)->i_mmap_sem);
> 1db175428 Jan Kara       2016-10-21  294  	if (write)
> 01a33b4ac Matthew Wilcox 2015-09-08  295  		sb_end_pagefault(sb);
> 01a33b4ac Matthew Wilcox 2015-09-08  296  
> 01a33b4ac Matthew Wilcox 2015-09-08  297  	return result;
> 11bd1a9ec Matthew Wilcox 2015-09-08  298  }
> 11bd1a9ec Matthew Wilcox 2015-09-08  299  
> 22711acc4 Dave Jiang     2017-02-03  300  static int ext4_dax_fault(struct vm_fault *vmf)
> 22711acc4 Dave Jiang     2017-02-03  301  {
> 22711acc4 Dave Jiang     2017-02-03 @302  	return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
> 22711acc4 Dave Jiang     2017-02-03  303  }
> 22711acc4 Dave Jiang     2017-02-03  304  
> ea3d7209c Jan Kara       2015-12-07  305  /*
> 1e9d180ba Ross Zwisler   2016-02-27  306   * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault()
> ea3d7209c Jan Kara       2015-12-07  307   * handler we check for races agaist truncate. Note that since we cycle through
> ea3d7209c Jan Kara       2015-12-07  308   * i_mmap_sem, we are sure that also any hole punching that began before we
> ea3d7209c Jan Kara       2015-12-07  309   * were called is finished by now and so if it included part of the file we
> ea3d7209c Jan Kara       2015-12-07  310   * are working on, our pte will get unmapped and the check for pte_same() in
> ea3d7209c Jan Kara       2015-12-07  311   * wp_pfn_shared() fails. Thus fault gets retried and things work out as
> ea3d7209c Jan Kara       2015-12-07  312   * desired.
> ea3d7209c Jan Kara       2015-12-07  313   */
> 1ebf3e0da Dave Jiang     2017-02-02  314  static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf)
> ea3d7209c Jan Kara       2015-12-07  315  {
> 1ebf3e0da Dave Jiang     2017-02-02  316  	struct inode *inode = file_inode(vmf->vma->vm_file);
> ea3d7209c Jan Kara       2015-12-07  317  	struct super_block *sb = inode->i_sb;
> ea3d7209c Jan Kara       2015-12-07  318  	loff_t size;
> d5be7a03b Ross Zwisler   2016-01-22  319  	int ret;
> ea3d7209c Jan Kara       2015-12-07  320  
> ea3d7209c Jan Kara       2015-12-07  321  	sb_start_pagefault(sb);
> 1ebf3e0da Dave Jiang     2017-02-02  322  	file_update_time(vmf->vma->vm_file);
> ea3d7209c Jan Kara       2015-12-07  323  	down_read(&EXT4_I(inode)->i_mmap_sem);
> ea3d7209c Jan Kara       2015-12-07  324  	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
> ea3d7209c Jan Kara       2015-12-07  325  	if (vmf->pgoff >= size)
> ea3d7209c Jan Kara       2015-12-07  326  		ret = VM_FAULT_SIGBUS;
> d5be7a03b Ross Zwisler   2016-01-22  327  	else
> 1ebf3e0da Dave Jiang     2017-02-02  328  		ret = dax_pfn_mkwrite(vmf);
> ea3d7209c Jan Kara       2015-12-07  329  	up_read(&EXT4_I(inode)->i_mmap_sem);
> ea3d7209c Jan Kara       2015-12-07  330  	sb_end_pagefault(sb);
> ea3d7209c Jan Kara       2015-12-07  331  
> ea3d7209c Jan Kara       2015-12-07  332  	return ret;
> 923ae0ff9 Ross Zwisler   2015-02-16  333  }
> 923ae0ff9 Ross Zwisler   2015-02-16  334  
> 923ae0ff9 Ross Zwisler   2015-02-16  335  static const struct vm_operations_struct ext4_dax_vm_ops = {
> 923ae0ff9 Ross Zwisler   2015-02-16  336  	.fault		= ext4_dax_fault,
> 22711acc4 Dave Jiang     2017-02-03 @337  	.huge_fault	= ext4_dax_huge_fault,
> 1e9d180ba Ross Zwisler   2016-02-27  338  	.page_mkwrite	= ext4_dax_fault,
> ea3d7209c Jan Kara       2015-12-07  339  	.pfn_mkwrite	= ext4_dax_pfn_mkwrite,
> 923ae0ff9 Ross Zwisler   2015-02-16  340  };
> 
> :::::: The code at line 280 was first introduced by commit
> :::::: 30599588c9eaccc211d383c9974a3a88dfa6e7d5 mm,fs,dax: change ->pmd_fault to ->huge_fault
> 
> :::::: TO: Dave Jiang <dave.jiang@intel.com>
> :::::: CC: Johannes Weiner <hannes@cmpxchg.org>
> 
> ---
> 0-DAY kernel test infrastructure                Open Source Technology Center
> https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
>
Dan Williams Feb. 3, 2017, 11:26 p.m. UTC | #4
On Fri, Feb 3, 2017 at 3:25 PM, Dave Jiang <dave.jiang@intel.com> wrote:
> On 02/03/2017 03:56 PM, kbuild test robot wrote:
>> Hi Dave,
>>
>> [auto build test ERROR on mmotm/master]
>> [cannot apply to linus/master linux/master v4.10-rc6 next-20170203]
>> [if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
>
> This one is a bit odd. I just pulled mmotm tree master branch and built
> with the attached .config and it passed for me (and I don't see this
> commit in the master branch). I also built linux-next with this patch on
> top and it also passes with attached .config. Looking at the err log
> below it seems the code has a mix of partial from before and after the
> patch. I'm rather confused about it....

This is a false positive. It tried to build it against latest mainline
instead of linux-next.
kernel test robot Feb. 4, 2017, 3:51 a.m. UTC | #5
Hi Dave,

[auto build test WARNING on mmotm/master]
[cannot apply to linus/master linux/master v4.10-rc6 next-20170203]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Dave-Jiang/mm-replace-FAULT_FLAG_SIZE-with-parameter-to-huge_fault/20170204-053548
base:   git://git.cmpxchg.org/linux-mmotm.git master
config: x86_64-randconfig-in0-02040556 (attached as .config)
compiler: gcc-4.6 (Debian 4.6.4-7) 4.6.4
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

All warnings (new ones prefixed by >>):

   fs/ext4/file.c:280:1: error: conflicting types for 'ext4_dax_huge_fault'
   fs/ext4/file.c:258:12: note: previous definition of 'ext4_dax_huge_fault' was here
   fs/ext4/file.c: In function 'ext4_dax_huge_fault':
   fs/ext4/file.c:292:2: error: incompatible type for argument 2 of 'dax_iomap_fault'
   include/linux/dax.h:41:5: note: expected 'enum page_entry_size' but argument is of type 'struct iomap_ops *'
   fs/ext4/file.c:292:2: error: too few arguments to function 'dax_iomap_fault'
   include/linux/dax.h:41:5: note: declared here
   fs/ext4/file.c: In function 'ext4_dax_fault':
   fs/ext4/file.c:302:2: error: too many arguments to function 'ext4_dax_huge_fault'
   fs/ext4/file.c:280:1: note: declared here
   fs/ext4/file.c: At top level:
>> fs/ext4/file.c:337:2: warning: initialization from incompatible pointer type [enabled by default]
   fs/ext4/file.c:337:2: warning: (near initialization for 'ext4_dax_vm_ops.huge_fault') [enabled by default]
   fs/ext4/file.c:258:12: warning: 'ext4_dax_huge_fault' defined but not used [-Wunused-function]

vim +337 fs/ext4/file.c

   286	
   287		if (write) {
   288			sb_start_pagefault(sb);
   289			file_update_time(vmf->vma->vm_file);
   290		}
   291		down_read(&EXT4_I(inode)->i_mmap_sem);
 > 292		result = dax_iomap_fault(vmf, &ext4_iomap_ops);
   293		up_read(&EXT4_I(inode)->i_mmap_sem);
   294		if (write)
   295			sb_end_pagefault(sb);
   296	
   297		return result;
   298	}
   299	
   300	static int ext4_dax_fault(struct vm_fault *vmf)
   301	{
   302		return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
   303	}
   304	
   305	/*
   306	 * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault()
   307	 * handler we check for races agaist truncate. Note that since we cycle through
   308	 * i_mmap_sem, we are sure that also any hole punching that began before we
   309	 * were called is finished by now and so if it included part of the file we
   310	 * are working on, our pte will get unmapped and the check for pte_same() in
   311	 * wp_pfn_shared() fails. Thus fault gets retried and things work out as
   312	 * desired.
   313	 */
   314	static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf)
   315	{
   316		struct inode *inode = file_inode(vmf->vma->vm_file);
   317		struct super_block *sb = inode->i_sb;
   318		loff_t size;
   319		int ret;
   320	
   321		sb_start_pagefault(sb);
   322		file_update_time(vmf->vma->vm_file);
   323		down_read(&EXT4_I(inode)->i_mmap_sem);
   324		size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
   325		if (vmf->pgoff >= size)
   326			ret = VM_FAULT_SIGBUS;
   327		else
   328			ret = dax_pfn_mkwrite(vmf);
   329		up_read(&EXT4_I(inode)->i_mmap_sem);
   330		sb_end_pagefault(sb);
   331	
   332		return ret;
   333	}
   334	
   335	static const struct vm_operations_struct ext4_dax_vm_ops = {
   336		.fault		= ext4_dax_fault,
 > 337		.huge_fault	= ext4_dax_huge_fault,
   338		.page_mkwrite	= ext4_dax_fault,
   339		.pfn_mkwrite	= ext4_dax_pfn_mkwrite,
   340	};

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Jan Kara Feb. 6, 2017, 8:51 a.m. UTC | #6
On Fri 03-02-17 14:31:22, Dave Jiang wrote:
> Since the introduction of FAULT_FLAG_SIZE to the vm_fault flag, it has
> been somewhat painful with getting the flags set and removed at the
> correct locations. More than one kernel oops was introduced due to
> difficulties of getting the placement correctly. Removing the flag
> values and introducing an input parameter to huge_fault that indicates
> the size of the page entry. This makes the code easier to trace and
> should avoid the issues we see with the fault flags where removal of the
> flag was necessary in the fallback paths.
> 
> Signed-off-by: Dave Jiang <dave.jiang@intel.com>

Yeah, the code looks less error-prone this way. I like it. You can add:

Reviewed-by: Jan Kara <jack@suse.cz>


								Honza
Christoph Hellwig Feb. 6, 2017, 2:36 p.m. UTC | #7
On Fri, Feb 03, 2017 at 02:31:22PM -0700, Dave Jiang wrote:
> Since the introduction of FAULT_FLAG_SIZE to the vm_fault flag, it has
> been somewhat painful with getting the flags set and removed at the
> correct locations. More than one kernel oops was introduced due to
> difficulties of getting the placement correctly. Removing the flag
> values and introducing an input parameter to huge_fault that indicates
> the size of the page entry. This makes the code easier to trace and
> should avoid the issues we see with the fault flags where removal of the
> flag was necessary in the fallback paths.

Why is this not in struct vm_fault?  Also can be use this opportunity
to fold ->huge_fault into ->fault?
Dan Williams Feb. 6, 2017, 4:24 p.m. UTC | #8
On Mon, Feb 6, 2017 at 6:36 AM, Christoph Hellwig <hch@infradead.org> wrote:
> On Fri, Feb 03, 2017 at 02:31:22PM -0700, Dave Jiang wrote:
>> Since the introduction of FAULT_FLAG_SIZE to the vm_fault flag, it has
>> been somewhat painful with getting the flags set and removed at the
>> correct locations. More than one kernel oops was introduced due to
>> difficulties of getting the placement correctly. Removing the flag
>> values and introducing an input parameter to huge_fault that indicates
>> the size of the page entry. This makes the code easier to trace and
>> should avoid the issues we see with the fault flags where removal of the
>> flag was necessary in the fallback paths.
>
> Why is this not in struct vm_fault?

Because this is easier to read and harder to get wrong. Same arguments
as getting rid of struct blk_dax_ctl.

> Also can be use this opportunity
> to fold ->huge_fault into ->fault?

Hmm, yes, just need a scheme to not attempt huge_faults on pte-only handlers.
Christoph Hellwig Feb. 6, 2017, 5:27 p.m. UTC | #9
On Mon, Feb 06, 2017 at 08:24:48AM -0800, Dan Williams wrote:
> > Also can be use this opportunity
> > to fold ->huge_fault into ->fault?
> 
> Hmm, yes, just need a scheme to not attempt huge_faults on pte-only handlers.

Do we need anything more than checking vma->vm_flags for VM_HUGETLB?
Dan Williams Feb. 6, 2017, 5:30 p.m. UTC | #10
On Mon, Feb 6, 2017 at 9:27 AM, Christoph Hellwig <hch@infradead.org> wrote:
> On Mon, Feb 06, 2017 at 08:24:48AM -0800, Dan Williams wrote:
>> > Also can be use this opportunity
>> > to fold ->huge_fault into ->fault?
>>
>> Hmm, yes, just need a scheme to not attempt huge_faults on pte-only handlers.
>
> Do we need anything more than checking vma->vm_flags for VM_HUGETLB?

s/VM_HUGETLB/VM_HUGEPAGE/

...but yes as long as we specify that a VM_HUGEPAGE handler must
minimally handle pud and pmd.
Kirill A. Shutemov Feb. 7, 2017, 8:44 a.m. UTC | #11
On Mon, Feb 06, 2017 at 09:30:22AM -0800, Dan Williams wrote:
> On Mon, Feb 6, 2017 at 9:27 AM, Christoph Hellwig <hch@infradead.org> wrote:
> > On Mon, Feb 06, 2017 at 08:24:48AM -0800, Dan Williams wrote:
> >> > Also can be use this opportunity
> >> > to fold ->huge_fault into ->fault?

BTW, for tmpfs we already use ->fault for both small and huge pages.
If ->fault returned THP, core mm look if it's possible to map the page as
huge in this particular VMA (due to size/alignment). If yes mm maps the
page with PMD, if not fallback to PTE.

I think it would be nice to do the same for DAX: filesystem provides core
mm with largest page this part of file can be mapped with (base aligned
address + lenght for DAX) and core mm sort out the rest.

> >> Hmm, yes, just need a scheme to not attempt huge_faults on pte-only handlers.
> >
> > Do we need anything more than checking vma->vm_flags for VM_HUGETLB?
> 
> s/VM_HUGETLB/VM_HUGEPAGE/
> 
> ...but yes as long as we specify that a VM_HUGEPAGE handler must
> minimally handle pud and pmd.

VM_HUGEPAGE is result of MADV_HUGEPAGE. It's not required to have THP in
the VMA.
Dan Williams Feb. 7, 2017, 4:56 p.m. UTC | #12
On Tue, Feb 7, 2017 at 12:44 AM, Kirill A. Shutemov
<kirill@shutemov.name> wrote:
> On Mon, Feb 06, 2017 at 09:30:22AM -0800, Dan Williams wrote:
>> On Mon, Feb 6, 2017 at 9:27 AM, Christoph Hellwig <hch@infradead.org> wrote:
>> > On Mon, Feb 06, 2017 at 08:24:48AM -0800, Dan Williams wrote:
>> >> > Also can be use this opportunity
>> >> > to fold ->huge_fault into ->fault?
>
> BTW, for tmpfs we already use ->fault for both small and huge pages.
> If ->fault returned THP, core mm look if it's possible to map the page as
> huge in this particular VMA (due to size/alignment). If yes mm maps the
> page with PMD, if not fallback to PTE.
>
> I think it would be nice to do the same for DAX: filesystem provides core
> mm with largest page this part of file can be mapped with (base aligned
> address + lenght for DAX) and core mm sort out the rest.

For DAX we would need plumb pfn_t into the core mm so that we have the
PFN_DEV and PFN_MAP flags beyond the raw pfn.

>
>> >> Hmm, yes, just need a scheme to not attempt huge_faults on pte-only handlers.
>> >
>> > Do we need anything more than checking vma->vm_flags for VM_HUGETLB?
>>
>> s/VM_HUGETLB/VM_HUGEPAGE/
>>
>> ...but yes as long as we specify that a VM_HUGEPAGE handler must
>> minimally handle pud and pmd.
>
> VM_HUGEPAGE is result of MADV_HUGEPAGE. It's not required to have THP in
> the VMA.

Filesystem-DAX and Device-DAX specify VM_HUGEPAGE by default.
Kirill A. Shutemov Feb. 7, 2017, 5:40 p.m. UTC | #13
On Tue, Feb 07, 2017 at 08:56:56AM -0800, Dan Williams wrote:
> On Tue, Feb 7, 2017 at 12:44 AM, Kirill A. Shutemov
> <kirill@shutemov.name> wrote:
> > On Mon, Feb 06, 2017 at 09:30:22AM -0800, Dan Williams wrote:
> >> On Mon, Feb 6, 2017 at 9:27 AM, Christoph Hellwig <hch@infradead.org> wrote:
> >> > On Mon, Feb 06, 2017 at 08:24:48AM -0800, Dan Williams wrote:
> >> >> > Also can be use this opportunity
> >> >> > to fold ->huge_fault into ->fault?
> >
> > BTW, for tmpfs we already use ->fault for both small and huge pages.
> > If ->fault returned THP, core mm look if it's possible to map the page as
> > huge in this particular VMA (due to size/alignment). If yes mm maps the
> > page with PMD, if not fallback to PTE.
> >
> > I think it would be nice to do the same for DAX: filesystem provides core
> > mm with largest page this part of file can be mapped with (base aligned
> > address + lenght for DAX) and core mm sort out the rest.
> 
> For DAX we would need plumb pfn_t into the core mm so that we have the
> PFN_DEV and PFN_MAP flags beyond the raw pfn.

Sounds good to me.

> >> >> Hmm, yes, just need a scheme to not attempt huge_faults on pte-only handlers.
> >> >
> >> > Do we need anything more than checking vma->vm_flags for VM_HUGETLB?
> >>
> >> s/VM_HUGETLB/VM_HUGEPAGE/
> >>
> >> ...but yes as long as we specify that a VM_HUGEPAGE handler must
> >> minimally handle pud and pmd.
> >
> > VM_HUGEPAGE is result of MADV_HUGEPAGE. It's not required to have THP in
> > the VMA.
> 
> Filesystem-DAX and Device-DAX specify VM_HUGEPAGE by default.

But why? Looks like abuse of the flag.
Dan Williams Feb. 7, 2017, 6:08 p.m. UTC | #14
On Tue, Feb 7, 2017 at 9:40 AM, Kirill A. Shutemov <kirill@shutemov.name> wrote:
> On Tue, Feb 07, 2017 at 08:56:56AM -0800, Dan Williams wrote:
>> On Tue, Feb 7, 2017 at 12:44 AM, Kirill A. Shutemov
>> <kirill@shutemov.name> wrote:
>> > On Mon, Feb 06, 2017 at 09:30:22AM -0800, Dan Williams wrote:
>> >> On Mon, Feb 6, 2017 at 9:27 AM, Christoph Hellwig <hch@infradead.org> wrote:
>> >> > On Mon, Feb 06, 2017 at 08:24:48AM -0800, Dan Williams wrote:
>> >> >> > Also can be use this opportunity
>> >> >> > to fold ->huge_fault into ->fault?
>> >
>> > BTW, for tmpfs we already use ->fault for both small and huge pages.
>> > If ->fault returned THP, core mm look if it's possible to map the page as
>> > huge in this particular VMA (due to size/alignment). If yes mm maps the
>> > page with PMD, if not fallback to PTE.
>> >
>> > I think it would be nice to do the same for DAX: filesystem provides core
>> > mm with largest page this part of file can be mapped with (base aligned
>> > address + lenght for DAX) and core mm sort out the rest.
>>
>> For DAX we would need plumb pfn_t into the core mm so that we have the
>> PFN_DEV and PFN_MAP flags beyond the raw pfn.
>
> Sounds good to me.
>
>> >> >> Hmm, yes, just need a scheme to not attempt huge_faults on pte-only handlers.
>> >> >
>> >> > Do we need anything more than checking vma->vm_flags for VM_HUGETLB?
>> >>
>> >> s/VM_HUGETLB/VM_HUGEPAGE/
>> >>
>> >> ...but yes as long as we specify that a VM_HUGEPAGE handler must
>> >> minimally handle pud and pmd.
>> >
>> > VM_HUGEPAGE is result of MADV_HUGEPAGE. It's not required to have THP in
>> > the VMA.
>>
>> Filesystem-DAX and Device-DAX specify VM_HUGEPAGE by default.
>
> But why? Looks like abuse of the flag.

Good question, that's been there since DAX was initially added and I
don't see a good reason for it currently. I'll take a look at what you
have for huge-tmpfs support.
Jan Kara Feb. 8, 2017, 8:41 a.m. UTC | #15
On Tue 07-02-17 08:56:56, Dan Williams wrote:
> On Tue, Feb 7, 2017 at 12:44 AM, Kirill A. Shutemov
> <kirill@shutemov.name> wrote:
> > On Mon, Feb 06, 2017 at 09:30:22AM -0800, Dan Williams wrote:
> >> On Mon, Feb 6, 2017 at 9:27 AM, Christoph Hellwig <hch@infradead.org> wrote:
> >> > On Mon, Feb 06, 2017 at 08:24:48AM -0800, Dan Williams wrote:
> >> >> > Also can be use this opportunity
> >> >> > to fold ->huge_fault into ->fault?
> >
> > BTW, for tmpfs we already use ->fault for both small and huge pages.
> > If ->fault returned THP, core mm look if it's possible to map the page as
> > huge in this particular VMA (due to size/alignment). If yes mm maps the
> > page with PMD, if not fallback to PTE.
> >
> > I think it would be nice to do the same for DAX: filesystem provides core
> > mm with largest page this part of file can be mapped with (base aligned
> > address + lenght for DAX) and core mm sort out the rest.
> 
> For DAX we would need plumb pfn_t into the core mm so that we have the
> PFN_DEV and PFN_MAP flags beyond the raw pfn.

So we can pass necessary information through struct vm_fault rather easily.
However due to DAX locking we cannot really "return" pfn for generic code
to install (we need to unlock radix tree locks after modifying page
tables). So if we want generic code to handle PFNs what needs to be done is
to teach finish_fault() to handle pfn_t which is passed to it and install
it in page tables.

Long term we could transition all page fault handlers (at least the
non-trivial ones) to using finish_fault() which would IMO make the code
flow easier to follow and export less of MM internals into drivers. However
there's so many fault handlers that I didn't have a good motivation to do
that yet.

								Honza
diff mbox

Patch

diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
index b90bb30..b75c772 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
@@ -538,7 +538,8 @@  static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
 }
 #endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 
-static int dax_dev_fault(struct vm_fault *vmf)
+static int dax_dev_huge_fault(struct vm_fault *vmf,
+		enum page_entry_size pe_size)
 {
 	int rc;
 	struct file *filp = vmf->vma->vm_file;
@@ -550,14 +551,14 @@  static int dax_dev_fault(struct vm_fault *vmf)
 			vmf->vma->vm_start, vmf->vma->vm_end);
 
 	rcu_read_lock();
-	switch (vmf->flags & FAULT_FLAG_SIZE_MASK) {
-	case FAULT_FLAG_SIZE_PTE:
+	switch (pe_size) {
+	case PE_SIZE_PTE:
 		rc = __dax_dev_pte_fault(dax_dev, vmf);
 		break;
-	case FAULT_FLAG_SIZE_PMD:
+	case PE_SIZE_PMD:
 		rc = __dax_dev_pmd_fault(dax_dev, vmf);
 		break;
-	case FAULT_FLAG_SIZE_PUD:
+	case PE_SIZE_PUD:
 		rc = __dax_dev_pud_fault(dax_dev, vmf);
 		break;
 	default:
@@ -568,9 +569,14 @@  static int dax_dev_fault(struct vm_fault *vmf)
 	return rc;
 }
 
+static int dax_dev_fault(struct vm_fault *vmf)
+{
+	return dax_dev_huge_fault(vmf, PE_SIZE_PTE);
+}
+
 static const struct vm_operations_struct dax_dev_vm_ops = {
 	.fault = dax_dev_fault,
-	.huge_fault = dax_dev_fault,
+	.huge_fault = dax_dev_huge_fault,
 };
 
 static int dax_mmap(struct file *filp, struct vm_area_struct *vma)
diff --git a/fs/dax.c b/fs/dax.c
index 25f791d..97b8ecb 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1446,12 +1446,13 @@  static int dax_iomap_pmd_fault(struct vm_fault *vmf, struct iomap_ops *ops)
  * has done all the necessary locking for page fault to proceed
  * successfully.
  */
-int dax_iomap_fault(struct vm_fault *vmf, struct iomap_ops *ops)
+int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
+		struct iomap_ops *ops)
 {
-	switch (vmf->flags & FAULT_FLAG_SIZE_MASK) {
-	case FAULT_FLAG_SIZE_PTE:
+	switch (pe_size) {
+	case PE_SIZE_PTE:
 		return dax_iomap_pte_fault(vmf, ops);
-	case FAULT_FLAG_SIZE_PMD:
+	case PE_SIZE_PMD:
 		return dax_iomap_pmd_fault(vmf, ops);
 	default:
 		return VM_FAULT_FALLBACK;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 6873883..b21891a 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -99,7 +99,7 @@  static int ext2_dax_fault(struct vm_fault *vmf)
 	}
 	down_read(&ei->dax_sem);
 
-	ret = dax_iomap_fault(vmf, &ext2_iomap_ops);
+	ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &ext2_iomap_ops);
 
 	up_read(&ei->dax_sem);
 	if (vmf->flags & FAULT_FLAG_WRITE)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 51d7155..e8ab46e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -255,7 +255,8 @@  ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 }
 
 #ifdef CONFIG_FS_DAX
-static int ext4_dax_fault(struct vm_fault *vmf)
+static int ext4_dax_huge_fault(struct vm_fault *vmf,
+		enum page_entry_size pe_size)
 {
 	int result;
 	struct inode *inode = file_inode(vmf->vma->vm_file);
@@ -267,7 +268,7 @@  static int ext4_dax_fault(struct vm_fault *vmf)
 		file_update_time(vmf->vma->vm_file);
 	}
 	down_read(&EXT4_I(inode)->i_mmap_sem);
-	result = dax_iomap_fault(vmf, &ext4_iomap_ops);
+	result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops);
 	up_read(&EXT4_I(inode)->i_mmap_sem);
 	if (write)
 		sb_end_pagefault(sb);
@@ -275,6 +276,11 @@  static int ext4_dax_fault(struct vm_fault *vmf)
 	return result;
 }
 
+static int ext4_dax_fault(struct vm_fault *vmf)
+{
+	return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
+}
+
 /*
  * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault()
  * handler we check for races agaist truncate. Note that since we cycle through
@@ -307,7 +313,7 @@  static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf)
 
 static const struct vm_operations_struct ext4_dax_vm_ops = {
 	.fault		= ext4_dax_fault,
-	.huge_fault	= ext4_dax_fault,
+	.huge_fault	= ext4_dax_huge_fault,
 	.page_mkwrite	= ext4_dax_fault,
 	.pfn_mkwrite	= ext4_dax_pfn_mkwrite,
 };
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index c4fe261..c37f435 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1385,7 +1385,7 @@  xfs_filemap_page_mkwrite(
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
 	if (IS_DAX(inode)) {
-		ret = dax_iomap_fault(vmf, &xfs_iomap_ops);
+		ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
 	} else {
 		ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
 		ret = block_page_mkwrite_return(ret);
@@ -1412,7 +1412,7 @@  xfs_filemap_fault(
 
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 	if (IS_DAX(inode))
-		ret = dax_iomap_fault(vmf, &xfs_iomap_ops);
+		ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
 	else
 		ret = filemap_fault(vmf);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1429,7 +1429,8 @@  xfs_filemap_fault(
  */
 STATIC int
 xfs_filemap_huge_fault(
-	struct vm_fault		*vmf)
+	struct vm_fault		*vmf,
+	enum page_entry_size	pe_size)
 {
 	struct inode		*inode = file_inode(vmf->vma->vm_file);
 	struct xfs_inode	*ip = XFS_I(inode);
@@ -1446,7 +1447,7 @@  xfs_filemap_huge_fault(
 	}
 
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-	ret = dax_iomap_fault(vmf, &xfs_iomap_ops);
+	ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
 	if (vmf->flags & FAULT_FLAG_WRITE)
diff --git a/include/linux/dax.h b/include/linux/dax.h
index a3bfa26..df63730 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -38,7 +38,8 @@  static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
 
 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 		struct iomap_ops *ops);
-int dax_iomap_fault(struct vm_fault *vmf, struct iomap_ops *ops);
+int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
+		struct iomap_ops *ops);
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
 int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index);
 int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7646ae5..7b11431 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -285,11 +285,6 @@  extern pgprot_t protection_map[16];
 #define FAULT_FLAG_REMOTE	0x80	/* faulting for non current tsk/mm */
 #define FAULT_FLAG_INSTRUCTION  0x100	/* The fault was during an instruction fetch */
 
-#define FAULT_FLAG_SIZE_MASK	0x7000	/* Support up to 8-level page tables */
-#define FAULT_FLAG_SIZE_PTE	0x0000	/* First level (eg 4k) */
-#define FAULT_FLAG_SIZE_PMD	0x1000	/* Second level (eg 2MB) */
-#define FAULT_FLAG_SIZE_PUD	0x2000	/* Third level (eg 1GB) */
-
 #define FAULT_FLAG_TRACE \
 	{ FAULT_FLAG_WRITE,		"WRITE" }, \
 	{ FAULT_FLAG_MKWRITE,		"MKWRITE" }, \
@@ -349,6 +344,13 @@  struct vm_fault {
 					 */
 };
 
+/* page entry size for vm->huge_fault() */
+enum page_entry_size {
+	PE_SIZE_PTE = 0,
+	PE_SIZE_PMD,
+	PE_SIZE_PUD,
+};
+
 /*
  * These are the virtual MM functions - opening of an area, closing and
  * unmapping it (needed to keep files on disk up-to-date etc), pointer
@@ -359,7 +361,7 @@  struct vm_operations_struct {
 	void (*close)(struct vm_area_struct * area);
 	int (*mremap)(struct vm_area_struct * area);
 	int (*fault)(struct vm_fault *vmf);
-	int (*huge_fault)(struct vm_fault *vmf);
+	int (*huge_fault)(struct vm_fault *vmf, enum page_entry_size pe_size);
 	void (*map_pages)(struct vm_fault *vmf,
 			pgoff_t start_pgoff, pgoff_t end_pgoff);
 
diff --git a/mm/memory.c b/mm/memory.c
index 41e2a2d..6040b74 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3489,7 +3489,7 @@  static int create_huge_pmd(struct vm_fault *vmf)
 	if (vma_is_anonymous(vmf->vma))
 		return do_huge_pmd_anonymous_page(vmf);
 	if (vmf->vma->vm_ops->huge_fault)
-		return vmf->vma->vm_ops->huge_fault(vmf);
+		return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
 	return VM_FAULT_FALLBACK;
 }
 
@@ -3498,7 +3498,7 @@  static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
 	if (vma_is_anonymous(vmf->vma))
 		return do_huge_pmd_wp_page(vmf, orig_pmd);
 	if (vmf->vma->vm_ops->huge_fault)
-		return vmf->vma->vm_ops->huge_fault(vmf);
+		return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
 
 	/* COW handled on pte level: split pmd */
 	VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
@@ -3519,7 +3519,7 @@  static int create_huge_pud(struct vm_fault *vmf)
 	if (vma_is_anonymous(vmf->vma))
 		return VM_FAULT_FALLBACK;
 	if (vmf->vma->vm_ops->huge_fault)
-		return vmf->vma->vm_ops->huge_fault(vmf);
+		return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 	return VM_FAULT_FALLBACK;
 }
@@ -3531,7 +3531,7 @@  static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
 	if (vma_is_anonymous(vmf->vma))
 		return VM_FAULT_FALLBACK;
 	if (vmf->vma->vm_ops->huge_fault)
-		return vmf->vma->vm_ops->huge_fault(vmf);
+		return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 	return VM_FAULT_FALLBACK;
 }
@@ -3659,7 +3659,6 @@  static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 	if (!vmf.pud)
 		return VM_FAULT_OOM;
 	if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) {
-		vmf.flags |= FAULT_FLAG_SIZE_PUD;
 		ret = create_huge_pud(&vmf);
 		if (!(ret & VM_FAULT_FALLBACK))
 			return ret;
@@ -3670,8 +3669,6 @@  static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 		if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
 			unsigned int dirty = flags & FAULT_FLAG_WRITE;
 
-			vmf.flags |= FAULT_FLAG_SIZE_PUD;
-
 			/* NUMA case for anonymous PUDs would go here */
 
 			if (dirty && !pud_write(orig_pud)) {
@@ -3689,18 +3686,14 @@  static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 	if (!vmf.pmd)
 		return VM_FAULT_OOM;
 	if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
-		vmf.flags |= FAULT_FLAG_SIZE_PMD;
 		ret = create_huge_pmd(&vmf);
 		if (!(ret & VM_FAULT_FALLBACK))
 			return ret;
-		/* fall through path, remove PMD flag */
-		vmf.flags &= ~FAULT_FLAG_SIZE_PMD;
 	} else {
 		pmd_t orig_pmd = *vmf.pmd;
 
 		barrier();
 		if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
-			vmf.flags |= FAULT_FLAG_SIZE_PMD;
 			if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
 				return do_huge_pmd_numa_page(&vmf, orig_pmd);
 
@@ -3709,8 +3702,6 @@  static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 				ret = wp_huge_pmd(&vmf, orig_pmd);
 				if (!(ret & VM_FAULT_FALLBACK))
 					return ret;
-				/* fall through path, remove PUD flag */
-				vmf.flags &= ~FAULT_FLAG_SIZE_PUD;
 			} else {
 				huge_pmd_set_accessed(&vmf, orig_pmd);
 				return 0;