
[v10,12/25] mm: cache some VMA fields in the vm_fault structure

Message ID 1523975611-15978-13-git-send-email-ldufour@linux.vnet.ibm.com (mailing list archive)
State Superseded
Series Speculative page faults

Commit Message

Laurent Dufour April 17, 2018, 2:33 p.m. UTC
When handling a speculative page fault, the vma->vm_flags and
vma->vm_page_prot fields are read once the page table lock is released. So
there is no longer any guarantee that these fields will not change behind
our back. They are saved in the vm_fault structure before the VMA is
checked for changes.

This patch also sets these fields in hugetlb_no_page() and
__collapse_huge_page_swapin() even though they are not needed by the callee.

Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
---
 include/linux/mm.h | 10 ++++++++--
 mm/huge_memory.c   |  6 +++---
 mm/hugetlb.c       |  2 ++
 mm/khugepaged.c    |  2 ++
 mm/memory.c        | 50 ++++++++++++++++++++++++++------------------------
 mm/migrate.c       |  2 +-
 6 files changed, 42 insertions(+), 30 deletions(-)
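
A minimal sketch of the pattern this patch introduces, condensed from the
hunks below (illustrative only, not a verbatim excerpt):

	/*
	 * The VMA fields are snapshotted once when the vm_fault is built;
	 * later PTE construction uses the snapshot instead of dereferencing
	 * the (possibly changing) VMA.
	 */
	struct vm_fault vmf = {
		.vma		= vma,
		.address	= address,
		.flags		= flags,
		.vma_flags	= vma->vm_flags,	/* snapshot */
		.vma_page_prot	= vma->vm_page_prot,	/* snapshot */
	};

	pte_t entry = mk_pte(page, vmf.vma_page_prot);
	entry = maybe_mkwrite(pte_mkdirty(entry), vmf.vma_flags);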

Comments

Minchan Kim April 23, 2018, 7:42 a.m. UTC | #1
On Tue, Apr 17, 2018 at 04:33:18PM +0200, Laurent Dufour wrote:
> When handling a speculative page fault, the vma->vm_flags and
> vma->vm_page_prot fields are read once the page table lock is released. So
> there is no longer any guarantee that these fields will not change behind
> our back. They are saved in the vm_fault structure before the VMA is
> checked for changes.

Sorry. I cannot understand.
If it is changed under us, what happens? If it's critical, why can't we
check with the seqcounter?
Clearly, I'm not understanding the logic here. However, it's a global
change even without CONFIG_SPF, so I want to be more careful.
It would be better to describe why we need to snapshot those values
into vm_fault rather than preventing the race.

Thanks.

Laurent Dufour May 3, 2018, 12:25 p.m. UTC | #2
On 23/04/2018 09:42, Minchan Kim wrote:
> On Tue, Apr 17, 2018 at 04:33:18PM +0200, Laurent Dufour wrote:
>> When handling a speculative page fault, the vma->vm_flags and
>> vma->vm_page_prot fields are read once the page table lock is released. So
>> there is no longer any guarantee that these fields will not change behind
>> our back. They are saved in the vm_fault structure before the VMA is
>> checked for changes.
> 
> Sorry. I cannot understand.
> If it is changed under us, what happens? If it's critical, why can't we
> check with the seqcounter?
> Clearly, I'm not understanding the logic here. However, it's a global
> change even without CONFIG_SPF, so I want to be more careful.
> It would be better to describe why we need to snapshot those values
> into vm_fault rather than preventing the race.

The idea is to go forward processing the page fault using the VMA field
values saved in the vm_fault structure. Then, once the PTE is locked, the
vma->sequence_counter is checked again, and if something has changed behind
our back the speculative page fault processing is aborted.
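
Schematically, the flow looks like this (an illustrative sketch only; the
VMA sequence counter helpers below are placeholders, not the series' API):

	seq = vma_seq_begin(vma);			/* sample the VMA sequence count */
	vmf.vma_flags = vma->vm_flags;			/* snapshot the fields used ... */
	vmf.vma_page_prot = vma->vm_page_prot;		/* ... to build the new PTE */

	entry = mk_pte(page, vmf.vma_page_prot);	/* work only on the snapshot */
	entry = maybe_mkwrite(pte_mkdirty(entry), vmf.vma_flags);

	spin_lock(vmf.ptl);				/* PTE lock (vmf.pte/vmf.ptl set by the walk) */
	if (vma_seq_changed(vma, seq)) {		/* VMA changed behind us? */
		spin_unlock(vmf.ptl);
		return VM_FAULT_RETRY;			/* abort, fall back to the classic path */
	}
	set_pte_at(vma->vm_mm, vmf.address, vmf.pte, entry);
	spin_unlock(vmf.ptl);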

Thanks,
Laurent.


Minchan Kim May 3, 2018, 3:42 p.m. UTC | #3
On Thu, May 03, 2018 at 02:25:18PM +0200, Laurent Dufour wrote:
> On 23/04/2018 09:42, Minchan Kim wrote:
> > On Tue, Apr 17, 2018 at 04:33:18PM +0200, Laurent Dufour wrote:
> >> When handling a speculative page fault, the vma->vm_flags and
> >> vma->vm_page_prot fields are read once the page table lock is released. So
> >> there is no longer any guarantee that these fields will not change behind
> >> our back. They are saved in the vm_fault structure before the VMA is
> >> checked for changes.
> > 
> > Sorry. I cannot understand.
> > If it is changed under us, what happens? If it's critical, why can't we
> > check with the seqcounter?
> > Clearly, I'm not understanding the logic here. However, it's a global
> > change even without CONFIG_SPF, so I want to be more careful.
> > It would be better to describe why we need to snapshot those values
> > into vm_fault rather than preventing the race.
> 
> The idea is to go forward processing the page fault using the VMA field
> values saved in the vm_fault structure. Then, once the PTE is locked, the
> vma->sequence_counter is checked again, and if something has changed behind
> our back the speculative page fault processing is aborted.

Sorry, I still don't understand why we should capture some fields into vm_fault.
If we find vma->seq_cnt has changed under the pte lock, can't we just bail out
and fall back to classic fault handling?

Maybe I'm missing something obvious here. It would be really helpful for
understanding if you could give some example.

Thanks.

Laurent Dufour May 4, 2018, 9:10 a.m. UTC | #4
On 03/05/2018 17:42, Minchan Kim wrote:
> On Thu, May 03, 2018 at 02:25:18PM +0200, Laurent Dufour wrote:
>> On 23/04/2018 09:42, Minchan Kim wrote:
>>> On Tue, Apr 17, 2018 at 04:33:18PM +0200, Laurent Dufour wrote:
>>>> When handling a speculative page fault, the vma->vm_flags and
>>>> vma->vm_page_prot fields are read once the page table lock is released. So
>>>> there is no longer any guarantee that these fields will not change behind
>>>> our back. They are saved in the vm_fault structure before the VMA is
>>>> checked for changes.
>>>
>>> Sorry. I cannot understand.
>>> If it is changed under us, what happens? If it's critical, why can't we
>>> check with the seqcounter?
>>> Clearly, I'm not understanding the logic here. However, it's a global
>>> change even without CONFIG_SPF, so I want to be more careful.
>>> It would be better to describe why we need to snapshot those values
>>> into vm_fault rather than preventing the race.
>>
>> The idea is to go forward processing the page fault using the VMA field
>> values saved in the vm_fault structure. Then, once the PTE is locked, the
>> vma->sequence_counter is checked again, and if something has changed behind
>> our back the speculative page fault processing is aborted.
> 
> Sorry, I still don't understand why we should capture some fields into vm_fault.
> If we find vma->seq_cnt has changed under the pte lock, can't we just bail out
> and fall back to classic fault handling?
> 
> Maybe I'm missing something obvious here. It would be really helpful for
> understanding if you could give some example.

I'd rather say that I was not clear enough ;)

Here is the point: when we deal with a speculative page fault, the mmap_sem is
not taken, so parallel VMA changes can occur. When a VMA change is made which
will impact the page fault processing, we assume that the VMA sequence counter
will be changed.

In the page fault processing, at the time the PTE is locked, we check the VMA
sequence counter to detect changes made behind our back. If no change is
detected, we can continue further. But this doesn't prevent the VMA from being
changed behind our back while the PTE is locked. So the VMA fields which are
used while the PTE is locked must be saved, to ensure that we are using
*static* values. This is important since the PTE changes will be made with
regard to these VMA fields, and they need to be consistent. This concerns the
vma->vm_flags and vma->vm_page_prot fields.
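
For instance, here is an illustrative snippet of the hazard being avoided
(simplified, not a verbatim excerpt from the series):

	/*
	 * Without the snapshot, a parallel mprotect() may update both
	 * vm_flags and vm_page_prot between the two reads below, so the
	 * new PTE could mix old and new protection bits.
	 */
	entry = mk_pte(page, vma->vm_page_prot);	/* may see the new prot ... */
	if (vma->vm_flags & VM_WRITE)			/* ... but still the old flags */
		entry = pte_mkwrite(pte_mkdirty(entry));

	/* With this patch, the speculative handler uses the one-time snapshot: */
	entry = mk_pte(page, vmf->vma_page_prot);
	if (vmf->vma_flags & VM_WRITE)
		entry = pte_mkwrite(pte_mkdirty(entry));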

I hope I have made this clear enough this time.

Thanks,
Laurent.
Minchan Kim May 8, 2018, 10:56 a.m. UTC | #5
On Fri, May 04, 2018 at 11:10:54AM +0200, Laurent Dufour wrote:
> On 03/05/2018 17:42, Minchan Kim wrote:
> > On Thu, May 03, 2018 at 02:25:18PM +0200, Laurent Dufour wrote:
> >> On 23/04/2018 09:42, Minchan Kim wrote:
> >>> On Tue, Apr 17, 2018 at 04:33:18PM +0200, Laurent Dufour wrote:
> >>>> When handling a speculative page fault, the vma->vm_flags and
> >>>> vma->vm_page_prot fields are read once the page table lock is released. So
> >>>> there is no longer any guarantee that these fields will not change behind
> >>>> our back. They are saved in the vm_fault structure before the VMA is
> >>>> checked for changes.
> >>>
> >>> Sorry. I cannot understand.
> >>> If it is changed under us, what happens? If it's critical, why can't we
> >>> check with the seqcounter?
> >>> Clearly, I'm not understanding the logic here. However, it's a global
> >>> change even without CONFIG_SPF, so I want to be more careful.
> >>> It would be better to describe why we need to snapshot those values
> >>> into vm_fault rather than preventing the race.
> >>
> >> The idea is to go forward processing the page fault using the VMA field
> >> values saved in the vm_fault structure. Then, once the PTE is locked, the
> >> vma->sequence_counter is checked again, and if something has changed behind
> >> our back the speculative page fault processing is aborted.
> > 
> > Sorry, I still don't understand why we should capture some fields into vm_fault.
> > If we find vma->seq_cnt has changed under the pte lock, can't we just bail out
> > and fall back to classic fault handling?
> > 
> > Maybe I'm missing something obvious here. It would be really helpful for
> > understanding if you could give some example.
> 
> I'd rather say that I was not clear enough ;)
> 
> Here is the point: when we deal with a speculative page fault, the mmap_sem is
> not taken, so parallel VMA changes can occur. When a VMA change is made which
> will impact the page fault processing, we assume that the VMA sequence counter
> will be changed.
> 
> In the page fault processing, at the time the PTE is locked, we check the VMA
> sequence counter to detect changes made behind our back. If no change is
> detected, we can continue further. But this doesn't prevent the VMA from being
> changed behind our back while the PTE is locked. So the VMA fields which are
> used while the PTE is locked must be saved, to ensure that we are using
> *static* values. This is important since the PTE changes will be made with
> regard to these VMA fields, and they need to be consistent. This concerns the
> vma->vm_flags and vma->vm_page_prot fields.
> 
> I hope I have made this clear enough this time.

It's clearer at this point. Please include such a nice explanation in the description.
Now, I am wondering how you synchronize those static values and the vma's seqcount.
It must be in the next patches. I hope to grab some time to read it, asap.

Thanks.

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f6edd15563bc..c65205c8c558 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -367,6 +367,12 @@  struct vm_fault {
 					 * page table to avoid allocation from
 					 * atomic context.
 					 */
+	/*
+	 * These entries are required when handling speculative page fault.
+	 * This way the page handling is done using consistent field values.
+	 */
+	unsigned long vma_flags;
+	pgprot_t vma_page_prot;
 };
 
 /* page entry size for vm->huge_fault() */
@@ -687,9 +693,9 @@  void free_compound_page(struct page *page);
  * pte_mkwrite.  But get_user_pages can cause write faults for mappings
  * that do not have writing enabled, when used by access_process_vm.
  */
-static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
+static inline pte_t maybe_mkwrite(pte_t pte, unsigned long vma_flags)
 {
-	if (likely(vma->vm_flags & VM_WRITE))
+	if (likely(vma_flags & VM_WRITE))
 		pte = pte_mkwrite(pte);
 	return pte;
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a3a1815f8e11..da2afda67e68 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1194,8 +1194,8 @@  static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
 
 	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
 		pte_t entry;
-		entry = mk_pte(pages[i], vma->vm_page_prot);
-		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		entry = mk_pte(pages[i], vmf->vma_page_prot);
+		entry = maybe_mkwrite(pte_mkdirty(entry), vmf->vma_flags);
 		memcg = (void *)page_private(pages[i]);
 		set_page_private(pages[i], 0);
 		page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
@@ -2168,7 +2168,7 @@  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 				entry = pte_swp_mksoft_dirty(entry);
 		} else {
 			entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
-			entry = maybe_mkwrite(entry, vma);
+			entry = maybe_mkwrite(entry, vma->vm_flags);
 			if (!write)
 				entry = pte_wrprotect(entry);
 			if (!young)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 218679138255..774864153407 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3718,6 +3718,8 @@  static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 				.vma = vma,
 				.address = address,
 				.flags = flags,
+				.vma_flags = vma->vm_flags,
+				.vma_page_prot = vma->vm_page_prot,
 				/*
 				 * Hard to debug if it ends up being
 				 * used by a callee that assumes
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 0b28af4b950d..2b02a9f9589e 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -887,6 +887,8 @@  static bool __collapse_huge_page_swapin(struct mm_struct *mm,
 		.flags = FAULT_FLAG_ALLOW_RETRY,
 		.pmd = pmd,
 		.pgoff = linear_page_index(vma, address),
+		.vma_flags = vma->vm_flags,
+		.vma_page_prot = vma->vm_page_prot,
 	};
 
 	/* we only decide to swapin, if there is enough young ptes */
diff --git a/mm/memory.c b/mm/memory.c
index f76f5027d251..2fb9920e06a5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1826,7 +1826,7 @@  static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 out_mkwrite:
 	if (mkwrite) {
 		entry = pte_mkyoung(entry);
-		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		entry = maybe_mkwrite(pte_mkdirty(entry), vma->vm_flags);
 	}
 
 	set_pte_at(mm, addr, pte, entry);
@@ -2472,7 +2472,7 @@  static inline void wp_page_reuse(struct vm_fault *vmf)
 
 	flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
 	entry = pte_mkyoung(vmf->orig_pte);
-	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+	entry = maybe_mkwrite(pte_mkdirty(entry), vmf->vma_flags);
 	if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
 		update_mmu_cache(vma, vmf->address, vmf->pte);
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2548,8 +2548,8 @@  static int wp_page_copy(struct vm_fault *vmf)
 			inc_mm_counter_fast(mm, MM_ANONPAGES);
 		}
 		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
-		entry = mk_pte(new_page, vma->vm_page_prot);
-		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		entry = mk_pte(new_page, vmf->vma_page_prot);
+		entry = maybe_mkwrite(pte_mkdirty(entry), vmf->vma_flags);
 		/*
 		 * Clear the pte entry and flush it first, before updating the
 		 * pte with the new entry. This will avoid a race condition
@@ -2614,7 +2614,7 @@  static int wp_page_copy(struct vm_fault *vmf)
 		 * Don't let another task, with possibly unlocked vma,
 		 * keep the mlocked page.
 		 */
-		if (page_copied && (vma->vm_flags & VM_LOCKED)) {
+		if (page_copied && (vmf->vma_flags & VM_LOCKED)) {
 			lock_page(old_page);	/* LRU manipulation */
 			if (PageMlocked(old_page))
 				munlock_vma_page(old_page);
@@ -2650,7 +2650,7 @@  static int wp_page_copy(struct vm_fault *vmf)
  */
 int finish_mkwrite_fault(struct vm_fault *vmf)
 {
-	WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
+	WARN_ON_ONCE(!(vmf->vma_flags & VM_SHARED));
 	if (!pte_map_lock(vmf))
 		return VM_FAULT_RETRY;
 	/*
@@ -2752,7 +2752,7 @@  static int do_wp_page(struct vm_fault *vmf)
 		 * We should not cow pages in a shared writeable mapping.
 		 * Just mark the pages writable and/or call ops->pfn_mkwrite.
 		 */
-		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+		if ((vmf->vma_flags & (VM_WRITE|VM_SHARED)) ==
 				     (VM_WRITE|VM_SHARED))
 			return wp_pfn_shared(vmf);
 
@@ -2799,7 +2799,7 @@  static int do_wp_page(struct vm_fault *vmf)
 			return VM_FAULT_WRITE;
 		}
 		unlock_page(vmf->page);
-	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+	} else if (unlikely((vmf->vma_flags & (VM_WRITE|VM_SHARED)) ==
 					(VM_WRITE|VM_SHARED))) {
 		return wp_page_shared(vmf);
 	}
@@ -3078,9 +3078,9 @@  int do_swap_page(struct vm_fault *vmf)
 
 	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
 	dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
-	pte = mk_pte(page, vma->vm_page_prot);
+	pte = mk_pte(page, vmf->vma_page_prot);
 	if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
-		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+		pte = maybe_mkwrite(pte_mkdirty(pte), vmf->vma_flags);
 		vmf->flags &= ~FAULT_FLAG_WRITE;
 		ret |= VM_FAULT_WRITE;
 		exclusive = RMAP_EXCLUSIVE;
@@ -3105,7 +3105,7 @@  int do_swap_page(struct vm_fault *vmf)
 
 	swap_free(entry);
 	if (mem_cgroup_swap_full(page) ||
-	    (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
+	    (vmf->vma_flags & VM_LOCKED) || PageMlocked(page))
 		try_to_free_swap(page);
 	unlock_page(page);
 	if (page != swapcache && swapcache) {
@@ -3163,7 +3163,7 @@  static int do_anonymous_page(struct vm_fault *vmf)
 	pte_t entry;
 
 	/* File mapping without ->vm_ops ? */
-	if (vma->vm_flags & VM_SHARED)
+	if (vmf->vma_flags & VM_SHARED)
 		return VM_FAULT_SIGBUS;
 
 	/*
@@ -3187,7 +3187,7 @@  static int do_anonymous_page(struct vm_fault *vmf)
 	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
 			!mm_forbids_zeropage(vma->vm_mm)) {
 		entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
-						vma->vm_page_prot));
+						vmf->vma_page_prot));
 		if (!pte_map_lock(vmf))
 			return VM_FAULT_RETRY;
 		if (!pte_none(*vmf->pte))
@@ -3220,8 +3220,8 @@  static int do_anonymous_page(struct vm_fault *vmf)
 	 */
 	__SetPageUptodate(page);
 
-	entry = mk_pte(page, vma->vm_page_prot);
-	if (vma->vm_flags & VM_WRITE)
+	entry = mk_pte(page, vmf->vma_page_prot);
+	if (vmf->vma_flags & VM_WRITE)
 		entry = pte_mkwrite(pte_mkdirty(entry));
 
 	if (!pte_map_lock(vmf)) {
@@ -3418,7 +3418,7 @@  static int do_set_pmd(struct vm_fault *vmf, struct page *page)
 	for (i = 0; i < HPAGE_PMD_NR; i++)
 		flush_icache_page(vma, page + i);
 
-	entry = mk_huge_pmd(page, vma->vm_page_prot);
+	entry = mk_huge_pmd(page, vmf->vma_page_prot);
 	if (write)
 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 
@@ -3492,11 +3492,11 @@  int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
 		return VM_FAULT_NOPAGE;
 
 	flush_icache_page(vma, page);
-	entry = mk_pte(page, vma->vm_page_prot);
+	entry = mk_pte(page, vmf->vma_page_prot);
 	if (write)
-		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		entry = maybe_mkwrite(pte_mkdirty(entry), vmf->vma_flags);
 	/* copy-on-write page */
-	if (write && !(vma->vm_flags & VM_SHARED)) {
+	if (write && !(vmf->vma_flags & VM_SHARED)) {
 		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
 		page_add_new_anon_rmap(page, vma, vmf->address, false);
 		mem_cgroup_commit_charge(page, memcg, false, false);
@@ -3535,7 +3535,7 @@  int finish_fault(struct vm_fault *vmf)
 
 	/* Did we COW the page? */
 	if ((vmf->flags & FAULT_FLAG_WRITE) &&
-	    !(vmf->vma->vm_flags & VM_SHARED))
+	    !(vmf->vma_flags & VM_SHARED))
 		page = vmf->cow_page;
 	else
 		page = vmf->page;
@@ -3789,7 +3789,7 @@  static int do_fault(struct vm_fault *vmf)
 		ret = VM_FAULT_SIGBUS;
 	else if (!(vmf->flags & FAULT_FLAG_WRITE))
 		ret = do_read_fault(vmf);
-	else if (!(vma->vm_flags & VM_SHARED))
+	else if (!(vmf->vma_flags & VM_SHARED))
 		ret = do_cow_fault(vmf);
 	else
 		ret = do_shared_fault(vmf);
@@ -3846,7 +3846,7 @@  static int do_numa_page(struct vm_fault *vmf)
 	 * accessible ptes, some can allow access by kernel mode.
 	 */
 	pte = ptep_modify_prot_start(vma->vm_mm, vmf->address, vmf->pte);
-	pte = pte_modify(pte, vma->vm_page_prot);
+	pte = pte_modify(pte, vmf->vma_page_prot);
 	pte = pte_mkyoung(pte);
 	if (was_writable)
 		pte = pte_mkwrite(pte);
@@ -3880,7 +3880,7 @@  static int do_numa_page(struct vm_fault *vmf)
 	 * Flag if the page is shared between multiple address spaces. This
 	 * is later used when determining whether to group tasks together
 	 */
-	if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
+	if (page_mapcount(page) > 1 && (vmf->vma_flags & VM_SHARED))
 		flags |= TNF_SHARED;
 
 	last_cpupid = page_cpupid_last(page);
@@ -3925,7 +3925,7 @@  static inline int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
 		return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
 
 	/* COW handled on pte level: split pmd */
-	VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
+	VM_BUG_ON_VMA(vmf->vma_flags & VM_SHARED, vmf->vma);
 	__split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
 
 	return VM_FAULT_FALLBACK;
@@ -4072,6 +4072,8 @@  static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 		.flags = flags,
 		.pgoff = linear_page_index(vma, address),
 		.gfp_mask = __get_fault_gfp_mask(vma),
+		.vma_flags = vma->vm_flags,
+		.vma_page_prot = vma->vm_page_prot,
 	};
 	unsigned int dirty = flags & FAULT_FLAG_WRITE;
 	struct mm_struct *mm = vma->vm_mm;
diff --git a/mm/migrate.c b/mm/migrate.c
index bb6367d70a3e..44d7007cfc1c 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -240,7 +240,7 @@  static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
 		 */
 		entry = pte_to_swp_entry(*pvmw.pte);
 		if (is_write_migration_entry(entry))
-			pte = maybe_mkwrite(pte, vma);
+			pte = maybe_mkwrite(pte, vma->vm_flags);
 
 		if (unlikely(is_zone_device_page(new))) {
 			if (is_device_private_page(new)) {