[v2,4/6] mm/mremap: Use mmu gather interface instead of flush_tlb_range

Message ID 20210315113824.270796-5-aneesh.kumar@linux.ibm.com (mailing list archive)
State Superseded
Headers show
Series Speedup mremap on ppc64 | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success Successfully applied on branch powerpc/merge (91966823812efbd175f904599e5cf2a854b39809)
snowpatch_ozlabs/checkpatch warning total: 0 errors, 1 warnings, 0 checks, 90 lines checked
snowpatch_ozlabs/needsstable success Patch has no Fixes tags

Commit Message

Aneesh Kumar K V March 15, 2021, 11:38 a.m. UTC
Some architectures have a page walk cache, and only the mmu gather interface
supports flushing it. A fast mremap that moves page table pages instead of
copying pte entries must flush the page walk cache, since the old translation
caches are no longer valid. Hence switch to mmu gather to flush the TLB and
mark tlb.freed_tables = 1. No page table pages actually need to be freed here.
With this change, the TLB flush is done outside the page table lock (ptl).

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 mm/mremap.c | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)
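
For quick reference, the core pattern this patch switches to looks roughly like
the following (a simplified sketch condensed from the diff below, not the
literal hunks; locking and the pmd/pud move itself are elided):

	struct mmu_gather tlb;

	tlb_gather_mmu(&tlb, mm);
	/* record the range covered by the moved page table page */
	tlb_flush_pte_range(&tlb, old_addr, PMD_SIZE);
	/* request a page walk cache flush; nothing is actually freed */
	tlb.freed_tables = 1;
	/* ... move the pmd/pud under the page table locks ... */
	/* flushes the TLB and, where the arch has one, the page walk cache */
	tlb_finish_mmu(&tlb);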

Comments

kernel test robot March 18, 2021, 8:22 a.m. UTC | #1
Hi "Aneesh,

I love your patch! Yet something to improve:

[auto build test ERROR on powerpc/next]
[also build test ERROR on kselftest/next v5.12-rc3 next-20210317]
[cannot apply to hnaz-linux-mm/master]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting a patch, we suggest using '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Aneesh-Kumar-K-V/Speedup-mremap-on-ppc64/20210315-194324
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: x86_64-rhel-8.3 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce (this is a W=1 build):
        # https://github.com/0day-ci/linux/commit/d3b9a3e6f414413d8f822185158b937d9f19b7a6
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Aneesh-Kumar-K-V/Speedup-mremap-on-ppc64/20210315-194324
        git checkout d3b9a3e6f414413d8f822185158b937d9f19b7a6
        # save the attached .config to linux build tree
        make W=1 ARCH=x86_64 

If you fix the issue, kindly add the following tag as appropriate:
Reported-by: kernel test robot <lkp@intel.com>

Note: the linux-review/Aneesh-Kumar-K-V/Speedup-mremap-on-ppc64/20210315-194324 HEAD 79633714ff2b990b3e4972873457678bb34d029f builds fine.
      It only hurts bisectability.

All errors (new ones prefixed by >>):

   mm/mremap.c: In function 'move_normal_pmd':
>> mm/mremap.c:219:20: error: storage size of 'tlb' isn't known
     219 |  struct mmu_gather tlb;
         |                    ^~~
>> mm/mremap.c:267:2: error: implicit declaration of function 'tlb_flush_pte_range' [-Werror=implicit-function-declaration]
     267 |  tlb_flush_pte_range(&tlb, old_addr, PMD_SIZE);
         |  ^~~~~~~~~~~~~~~~~~~
   mm/mremap.c:219:20: warning: unused variable 'tlb' [-Wunused-variable]
     219 |  struct mmu_gather tlb;
         |                    ^~~
   mm/mremap.c: In function 'move_normal_pud':
   mm/mremap.c:297:20: error: storage size of 'tlb' isn't known
     297 |  struct mmu_gather tlb;
         |                    ^~~
   mm/mremap.c:297:20: warning: unused variable 'tlb' [-Wunused-variable]
   cc1: some warnings being treated as errors
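
The errors above indicate that struct mmu_gather and tlb_flush_pte_range() are
simply not visible in mm/mremap.c with this config; both are provided by
asm-generic/tlb.h, normally pulled in via <asm/tlb.h>. A probable fix, assuming
that is indeed the cause (this is not part of the posted patch), is an explicit
include:

	/*
	 * Likely fix for the x86_64 build error (an assumption, not taken
	 * from the posted patch): include the mmu_gather definitions
	 * directly rather than relying on another header to drag them in.
	 */
	#include <asm/tlb.h>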


vim +219 mm/mremap.c

   212	
   213	#ifdef CONFIG_HAVE_MOVE_PMD
   214	static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
   215			  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
   216	{
   217		spinlock_t *old_ptl, *new_ptl;
   218		struct mm_struct *mm = vma->vm_mm;
 > 219		struct mmu_gather tlb;
   220		pmd_t pmd;
   221	
   222		/*
   223		 * The destination pmd shouldn't be established, free_pgtables()
   224		 * should have released it.
   225		 *
   226		 * However, there's a case during execve() where we use mremap
   227		 * to move the initial stack, and in that case the target area
   228		 * may overlap the source area (always moving down).
   229		 *
   230		 * If everything is PMD-aligned, that works fine, as moving
   231		 * each pmd down will clear the source pmd. But if we first
   232		 * have a few 4kB-only pages that get moved down, and then
   233		 * hit the "now the rest is PMD-aligned, let's do everything
   234		 * one pmd at a time", we will still have the old (now empty
   235		 * of any 4kB pages, but still there) PMD in the page table
   236		 * tree.
   237		 *
   238		 * Warn on it once - because we really should try to figure
   239		 * out how to do this better - but then say "I won't move
   240		 * this pmd".
   241		 *
   242		 * One alternative might be to just unmap the target pmd at
   243		 * this point, and verify that it really is empty. We'll see.
   244		 */
   245		if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
   246			return false;
   247	
   248		tlb_gather_mmu(&tlb, mm);
   249		/*
   250		 * We don't have to worry about the ordering of src and dst
   251		 * ptlocks because exclusive mmap_lock prevents deadlock.
   252		 */
   253		old_ptl = pmd_lock(mm, old_pmd);
   254		new_ptl = pmd_lockptr(mm, new_pmd);
   255		if (new_ptl != old_ptl)
   256			spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
   257	
   258		/* Clear the pmd */
   259		pmd = *old_pmd;
   260		pmd_clear(old_pmd);
   261	
   262		/*
   263		 * Mark the range. We are not freeing page table pages nor
   264		 * regular pages. Hence we don't need to call tlb_remove_table()
   265		 * or tlb_remove_page().
   266		 */
 > 267		tlb_flush_pte_range(&tlb, old_addr, PMD_SIZE);
   268		tlb.freed_tables = 1;
   269		VM_BUG_ON(!pmd_none(*new_pmd));
   270		pmd_populate(mm, new_pmd, (pgtable_t)pmd_page_vaddr(pmd));
   271	
   272		if (new_ptl != old_ptl)
   273			spin_unlock(new_ptl);
   274		spin_unlock(old_ptl);
   275		/*
   276		 * This will invalidate both the old TLB and page table walk caches.
   277		 */
   278		tlb_finish_mmu(&tlb);
   279	
   280		return true;
   281	}
   282	#else
   283	static inline bool move_normal_pmd(struct vm_area_struct *vma,
   284			unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd,
   285			pmd_t *new_pmd)
   286	{
   287		return false;
   288	}
   289	#endif
   290	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Nicholas Piggin March 18, 2021, 8:34 a.m. UTC | #2
Excerpts from Aneesh Kumar K.V's message of March 15, 2021 9:38 pm:
> Some architectures have a page walk cache, and only the mmu gather interface
> supports flushing it. A fast mremap that moves page table pages instead of
> copying pte entries must flush the page walk cache, since the old translation
> caches are no longer valid. Hence switch to mmu gather to flush the TLB and
> mark tlb.freed_tables = 1. No page table pages actually need to be freed here.
> With this change, the TLB flush is done outside the page table lock (ptl).

I would maybe just get archs that implement it to provide a specific
flush_tlb+pwc_range for it, or else they get flush_tlb_range by default.

I think that would be simpler for now, at least in generic code.
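
For illustration, that kind of default could look roughly like this (a
hypothetical sketch of the idea; flush_tlb_pwc_range is not an existing API,
just a name for what is being described):

	/*
	 * Hypothetical sketch: architectures with a page walk cache supply
	 * their own flush_tlb_pwc_range(); everyone else transparently
	 * falls back to plain flush_tlb_range(). Neither the name nor the
	 * hook exists upstream.
	 */
	#ifndef flush_tlb_pwc_range
	#define flush_tlb_pwc_range(vma, start, end) \
		flush_tlb_range(vma, start, end)
	#endif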

There was some other talk of consolidating the TLB flush APIs; I just
don't know whether using the page/page table gathering and freeing API
is the best way to go for this.

Thanks,
Nick

> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  mm/mremap.c | 33 +++++++++++++++++++++++++++++----
>  1 file changed, 29 insertions(+), 4 deletions(-)
> 
> diff --git a/mm/mremap.c b/mm/mremap.c
> index 574287f9bb39..fafa73b965d3 100644
> --- a/mm/mremap.c
> +++ b/mm/mremap.c
> @@ -216,6 +216,7 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
>  {
>  	spinlock_t *old_ptl, *new_ptl;
>  	struct mm_struct *mm = vma->vm_mm;
> +	struct mmu_gather tlb;
>  	pmd_t pmd;
>  
>  	/*
> @@ -244,11 +245,12 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
>  	if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
>  		return false;
>  
> +	tlb_gather_mmu(&tlb, mm);
>  	/*
>  	 * We don't have to worry about the ordering of src and dst
>  	 * ptlocks because exclusive mmap_lock prevents deadlock.
>  	 */
> -	old_ptl = pmd_lock(vma->vm_mm, old_pmd);
> +	old_ptl = pmd_lock(mm, old_pmd);
>  	new_ptl = pmd_lockptr(mm, new_pmd);
>  	if (new_ptl != old_ptl)
>  		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
> @@ -257,13 +259,23 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
>  	pmd = *old_pmd;
>  	pmd_clear(old_pmd);
>  
> +	/*
> +	 * Mark the range. We are not freeing page table pages nor
> +	 * regular pages. Hence we don't need to call tlb_remove_table()
> +	 * or tlb_remove_page().
> +	 */
> +	tlb_flush_pte_range(&tlb, old_addr, PMD_SIZE);
> +	tlb.freed_tables = 1;
>  	VM_BUG_ON(!pmd_none(*new_pmd));
>  	pmd_populate(mm, new_pmd, (pgtable_t)pmd_page_vaddr(pmd));
>  
> -	flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
>  	if (new_ptl != old_ptl)
>  		spin_unlock(new_ptl);
>  	spin_unlock(old_ptl);
> +	/*
> +	 * This will invalidate both the old TLB and page table walk caches.
> +	 */
> +	tlb_finish_mmu(&tlb);
>  
>  	return true;
>  }
> @@ -282,6 +294,7 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
>  {
>  	spinlock_t *old_ptl, *new_ptl;
>  	struct mm_struct *mm = vma->vm_mm;
> +	struct mmu_gather tlb;
>  	pud_t pud;
>  
>  	/*
> @@ -291,11 +304,12 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
>  	if (WARN_ON_ONCE(!pud_none(*new_pud)))
>  		return false;
>  
> +	tlb_gather_mmu(&tlb, mm);
>  	/*
>  	 * We don't have to worry about the ordering of src and dst
>  	 * ptlocks because exclusive mmap_lock prevents deadlock.
>  	 */
> -	old_ptl = pud_lock(vma->vm_mm, old_pud);
> +	old_ptl = pud_lock(mm, old_pud);
>  	new_ptl = pud_lockptr(mm, new_pud);
>  	if (new_ptl != old_ptl)
>  		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
> @@ -304,14 +318,25 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
>  	pud = *old_pud;
>  	pud_clear(old_pud);
>  
> +	/*
> +	 * Mark the range. We are not freeing page table pages nor
> +	 * regular pages. Hence we don't need to call tlb_remove_table()
> +	 * or tlb_remove_page().
> +	 */
> +	tlb_flush_pte_range(&tlb, old_addr, PUD_SIZE);
> +	tlb.freed_tables = 1;
>  	VM_BUG_ON(!pud_none(*new_pud));
>  
>  	pud_populate(mm, new_pud, (pmd_t *)pud_page_vaddr(pud));
> -	flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
> +
>  	if (new_ptl != old_ptl)
>  		spin_unlock(new_ptl);
>  	spin_unlock(old_ptl);
>  
> +	/*
> +	 * This will invalidate both the old TLB and page table walk caches.
> +	 */
> +	tlb_finish_mmu(&tlb);
>  	return true;
>  }
>  #else
> -- 
> 2.29.2
> 
>

Patch

diff --git a/mm/mremap.c b/mm/mremap.c
index 574287f9bb39..fafa73b965d3 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -216,6 +216,7 @@  static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 {
 	spinlock_t *old_ptl, *new_ptl;
 	struct mm_struct *mm = vma->vm_mm;
+	struct mmu_gather tlb;
 	pmd_t pmd;
 
 	/*
@@ -244,11 +245,12 @@  static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 	if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
 		return false;
 
+	tlb_gather_mmu(&tlb, mm);
 	/*
 	 * We don't have to worry about the ordering of src and dst
 	 * ptlocks because exclusive mmap_lock prevents deadlock.
 	 */
-	old_ptl = pmd_lock(vma->vm_mm, old_pmd);
+	old_ptl = pmd_lock(mm, old_pmd);
 	new_ptl = pmd_lockptr(mm, new_pmd);
 	if (new_ptl != old_ptl)
 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -257,13 +259,23 @@  static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 	pmd = *old_pmd;
 	pmd_clear(old_pmd);
 
+	/*
+	 * Mark the range. We are not freeing page table pages nor
+	 * regular pages. Hence we don't need to call tlb_remove_table()
+	 * or tlb_remove_page().
+	 */
+	tlb_flush_pte_range(&tlb, old_addr, PMD_SIZE);
+	tlb.freed_tables = 1;
 	VM_BUG_ON(!pmd_none(*new_pmd));
 	pmd_populate(mm, new_pmd, (pgtable_t)pmd_page_vaddr(pmd));
 
-	flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
 	if (new_ptl != old_ptl)
 		spin_unlock(new_ptl);
 	spin_unlock(old_ptl);
+	/*
+	 * This will invalidate both the old TLB and page table walk caches.
+	 */
+	tlb_finish_mmu(&tlb);
 
 	return true;
 }
@@ -282,6 +294,7 @@  static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
 {
 	spinlock_t *old_ptl, *new_ptl;
 	struct mm_struct *mm = vma->vm_mm;
+	struct mmu_gather tlb;
 	pud_t pud;
 
 	/*
@@ -291,11 +304,12 @@  static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
 	if (WARN_ON_ONCE(!pud_none(*new_pud)))
 		return false;
 
+	tlb_gather_mmu(&tlb, mm);
 	/*
 	 * We don't have to worry about the ordering of src and dst
 	 * ptlocks because exclusive mmap_lock prevents deadlock.
 	 */
-	old_ptl = pud_lock(vma->vm_mm, old_pud);
+	old_ptl = pud_lock(mm, old_pud);
 	new_ptl = pud_lockptr(mm, new_pud);
 	if (new_ptl != old_ptl)
 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -304,14 +318,25 @@  static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
 	pud = *old_pud;
 	pud_clear(old_pud);
 
+	/*
+	 * Mark the range. We are not freeing page table pages nor
+	 * regular pages. Hence we don't need to call tlb_remove_table()
+	 * or tlb_remove_page().
+	 */
+	tlb_flush_pte_range(&tlb, old_addr, PUD_SIZE);
+	tlb.freed_tables = 1;
 	VM_BUG_ON(!pud_none(*new_pud));
 
 	pud_populate(mm, new_pud, (pmd_t *)pud_page_vaddr(pud));
-	flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
+
 	if (new_ptl != old_ptl)
 		spin_unlock(new_ptl);
 	spin_unlock(old_ptl);
 
+	/*
+	 * This will invalidate both the old TLB and page table walk caches.
+	 */
+	tlb_finish_mmu(&tlb);
 	return true;
 }
 #else