
[v6,16/24] mm: Protect mm_rb tree with a rwlock

Message ID: 1515777968-867-17-git-send-email-ldufour@linux.vnet.ibm.com
State: Not Applicable
Series: Speculative page faults

Commit Message

Laurent Dufour Jan. 12, 2018, 5:26 p.m. UTC
This change is inspired by Peter's proposal patch [1], which protected the
VMA using SRCU. Unfortunately, SRCU does not scale well in this particular
case, and it introduces a major performance degradation due to excessive
scheduling operations.

To allow access to the mm_rb tree without grabbing the mmap_sem, this patch
protects access to it with a rwlock.  As a search in the mm_rb tree is
O(log n), it is safe to protect it with such a lock.  The VMA cache is not
protected by the new rwlock and must not be used without holding the
mmap_sem.

To allow a VMA found in the tree to be used once the rwlock is released, a
use count is added to the VMA structure. It is set to 1 when the VMA is
allocated, incremented each time the VMA is looked up with the rwlock held,
and decremented each time the VMA is released. When the use count reaches
zero, the VMA is no longer used and is freed.

This patch prepares for two kinds of VMA access:
 - as usual, under the control of the mmap_sem,
 - without holding the mmap_sem, for the speculative page fault handler.

Accesses made under the control of the mmap_sem do not require the rwlock
to protect read access to the mm_rb tree, but write accesses must also be
done under the protection of the rwlock. This affects inserting and
removing elements in the RB tree.
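For example, linking a new VMA into the tree now takes the rwlock for
writing around the rb-tree update (taken from __vma_link_rb() in the patch
below):

#ifdef CONFIG_SPF
	atomic_set(&vma->vm_ref_count, 1);
	write_lock(&mm->mm_rb_lock);
#endif
	rb_link_node(&vma->vm_rb, rb_parent, rb_link);
	vma->rb_subtree_gap = 0;
	vma_gap_update(vma);
	vma_rb_insert(vma, mm);
#ifdef CONFIG_SPF
	write_unlock(&mm->mm_rb_lock);
#endif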

The patch introduces two new functions:
 - get_vma() to find a VMA based on an address, holding the new rwlock,
 - put_vma() to release the VMA when it is no longer used.
These services are designed to be used when the RB tree is accessed without
holding the mmap_sem.
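A caller walking the tree without the mmap_sem is expected to pair the two,
roughly as follows (a hypothetical caller written for illustration; the
real user is the speculative page fault handler added later in the series,
and the VM_FAULT_RETRY fallback is only illustrative):

/* Hypothetical illustration of the expected calling pattern. */
static int speculative_lookup_example(struct mm_struct *mm,
				      unsigned long address)
{
	struct vm_area_struct *vma;

	vma = get_vma(mm, address);	/* takes a reference under mm_rb_lock */
	if (!vma)
		return VM_FAULT_RETRY;	/* fall back to the classic fault path */

	/* ... speculative handling of the fault against this VMA ... */

	put_vma(vma);			/* drops the reference, may free the VMA */
	return 0;
}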

When a VMA is removed from the RB tree, its vm_rb node is cleared, and we
rely on the write memory barrier implied by releasing the rwlock to order
that write against the read memory barrier done in a later patch when
checking the VMA's validity.
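The intended ordering is therefore as follows (the writer side is taken
from __vma_rb_erase() in the patch below; the reader side is only a sketch
of the check performed by vma_has_changed() in a later patch of the series,
with a hypothetical abort label):

	/* Writer: removal from the tree */
	write_lock(&mm->mm_rb_lock);
	rb_erase_augmented(&vma->vm_rb, &mm->mm_rb, &vma_gap_callbacks);
	write_unlock(&mm->mm_rb_lock);	/* release acts as the WMB */
	RB_CLEAR_NODE(&vma->vm_rb);

	/* Reader: speculative path, sketch only (the RMB sits in the real check) */
	if (RB_EMPTY_NODE(&vma->vm_rb))
		goto abort;	/* the VMA has been removed, abort the speculative fault */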

When a VMA is removed, the file mapping associated with it is closed
immediately (vm_ops->close()), but the memory policy and the file structure
remain in use until the VMA's use count reaches 0, which may happen later,
once an in-progress speculative page fault has completed.
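This split is visible in remove_vma() after the patch: the close callback
runs right away, while fput() and mpol_put() move to __free_vma(), which
only runs once the last reference is gone:

static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
{
	struct vm_area_struct *next = vma->vm_next;

	might_sleep();
	if (vma->vm_ops && vma->vm_ops->close)
		vma->vm_ops->close(vma);	/* closed immediately */
	put_vma(vma);	/* fput()/mpol_put() deferred to __free_vma() */
	return next;
}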

[1] https://patchwork.kernel.org/patch/5108281/

Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/mm_types.h |   4 ++
 kernel/fork.c            |   3 ++
 mm/init-mm.c             |   3 ++
 mm/internal.h            |   6 +++
 mm/mmap.c                | 120 ++++++++++++++++++++++++++++++++++-------------
 5 files changed, 104 insertions(+), 32 deletions(-)

Comments

Matthew Wilcox (Oracle) Jan. 12, 2018, 6:48 p.m. UTC | #1
On Fri, Jan 12, 2018 at 06:26:00PM +0100, Laurent Dufour wrote:
> -static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
> +static void __vma_rb_erase(struct vm_area_struct *vma, struct mm_struct *mm)
>  {
> +	struct rb_root *root = &mm->mm_rb;
>  	/*
>  	 * Note rb_erase_augmented is a fairly large inline function,
>  	 * so make sure we instantiate it only once with our desired
>  	 * augmented rbtree callbacks.
>  	 */
> +#ifdef CONFIG_SPF
> +	write_lock(&mm->mm_rb_lock);
> +#endif
>  	rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
> +#ifdef CONFIG_SPF
> +	write_unlock(&mm->mm_rb_lock); /* wmb */
> +#endif

I can't say I love this.  Have you considered:

#ifdef CONFIG_SPF
#define vma_rb_write_lock(mm)	write_lock(&mm->mm_rb_lock)
#define vma_rb_write_unlock(mm)	write_unlock(&mm->mm_rb_lock)
#else
#define vma_rb_write_lock(mm)	do { } while (0)
#define vma_rb_write_unlock(mm)	do { } while (0)
#endif

Also, SPF is kind of uninformative.  CONFIG_MM_SPF might be better?
Or perhaps even CONFIG_SPECULATIVE_PAGE_FAULT, just to make it really
painful to do these one-liner ifdefs that make the code so hard to read.
Laurent Dufour Jan. 15, 2018, 5:42 p.m. UTC | #2
Hi Matthew,

Thanks for reviewing this series.

On 12/01/2018 19:48, Matthew Wilcox wrote:
> On Fri, Jan 12, 2018 at 06:26:00PM +0100, Laurent Dufour wrote:
>> -static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
>> +static void __vma_rb_erase(struct vm_area_struct *vma, struct mm_struct *mm)
>>  {
>> +	struct rb_root *root = &mm->mm_rb;
>>  	/*
>>  	 * Note rb_erase_augmented is a fairly large inline function,
>>  	 * so make sure we instantiate it only once with our desired
>>  	 * augmented rbtree callbacks.
>>  	 */
>> +#ifdef CONFIG_SPF
>> +	write_lock(&mm->mm_rb_lock);
>> +#endif
>>  	rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
>> +#ifdef CONFIG_SPF
>> +	write_unlock(&mm->mm_rb_lock); /* wmb */
>> +#endif
> 
> I can't say I love this.  Have you considered:
> 
> #ifdef CONFIG_SPF
> #define vma_rb_write_lock(mm)	write_lock(&mm->mm_rb_lock)
> #define vma_rb_write_unlock(mm)	write_unlock(&mm->mm_rb_lock)
> #else
> #define vma_rb_write_lock(mm)	do { } while (0)
> #define vma_rb_write_unlock(mm)	do { } while (0)
> #endif

I hadn't considered this, but it sounds smarter. I'll do that.

> Also, SPF is kind of uninformative.  CONFIG_MM_SPF might be better?
> Or perhaps even CONFIG_SPECULATIVE_PAGE_FAULT, just to make it really
> painful to do these one-liner ifdefs that make the code so hard to read.

Thomas also complained about that, and I agree, SPF is quite cryptic. That
being said, I don't think CONFIG_MM_SPF would be much better, so I'll
change this define to CONFIG_SPECULATIVE_PAGE_FAULT; even if it's longer,
it should not appear too often in the code.

Thanks,
Laurent.
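
Combining Matthew's wrappers with the agreed rename, the helpers would look
roughly like this (a sketch of the planned follow-up, not part of the
posted patch):

#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
#define vma_rb_write_lock(mm)	write_lock(&(mm)->mm_rb_lock)
#define vma_rb_write_unlock(mm)	write_unlock(&(mm)->mm_rb_lock)
#else
#define vma_rb_write_lock(mm)	do { } while (0)
#define vma_rb_write_unlock(mm)	do { } while (0)
#endif

so that __vma_rb_erase() would reduce to:

	vma_rb_write_lock(mm);
	rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
	vma_rb_write_unlock(mm);	/* wmb */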

Patch

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e0e3df3b9641..2684df7e7294 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -335,6 +335,7 @@  struct vm_area_struct {
 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
 #ifdef CONFIG_SPF
 	seqcount_t vm_sequence;
+	atomic_t vm_ref_count;		/* see vma_get(), vma_put() */
 #endif
 } __randomize_layout;
 
@@ -353,6 +354,9 @@  struct kioctx_table;
 struct mm_struct {
 	struct vm_area_struct *mmap;		/* list of VMAs */
 	struct rb_root mm_rb;
+#ifdef CONFIG_SPF
+	rwlock_t mm_rb_lock;
+#endif
 	u32 vmacache_seqnum;                   /* per-thread vmacache */
 #ifdef CONFIG_MMU
 	unsigned long (*get_unmapped_area) (struct file *filp,
diff --git a/kernel/fork.c b/kernel/fork.c
index 0914307d4f3b..d99606e1e9ba 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -898,6 +898,9 @@  static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	mm->mmap = NULL;
 	mm->mm_rb = RB_ROOT;
 	mm->vmacache_seqnum = 0;
+#ifdef CONFIG_SPF
+	rwlock_init(&mm->mm_rb_lock);
+#endif
 	atomic_set(&mm->mm_users, 1);
 	atomic_set(&mm->mm_count, 1);
 	init_rwsem(&mm->mmap_sem);
diff --git a/mm/init-mm.c b/mm/init-mm.c
index f94d5d15ebc0..aaa5d7851d87 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -17,6 +17,9 @@ 
 
 struct mm_struct init_mm = {
 	.mm_rb		= RB_ROOT,
+#ifdef CONFIG_SPF
+	.mm_rb_lock	= __RW_LOCK_UNLOCKED(init_mm.mm_rb_lock),
+#endif
 	.pgd		= swapper_pg_dir,
 	.mm_users	= ATOMIC_INIT(2),
 	.mm_count	= ATOMIC_INIT(1),
diff --git a/mm/internal.h b/mm/internal.h
index 62d8c34e63d5..4b9c3357bd6c 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -40,6 +40,12 @@  void page_writeback_init(void);
 
 int do_swap_page(struct vm_fault *vmf);
 
+#ifdef CONFIG_SPF
+extern struct vm_area_struct *get_vma(struct mm_struct *mm,
+				      unsigned long addr);
+extern void put_vma(struct vm_area_struct *vma);
+#endif
+
 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
 		unsigned long floor, unsigned long ceiling);
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 960e2f16ffcf..972ddee0b151 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -160,6 +160,27 @@  void unlink_file_vma(struct vm_area_struct *vma)
 	}
 }
 
+static void __free_vma(struct vm_area_struct *vma)
+{
+	if (vma->vm_file)
+		fput(vma->vm_file);
+	mpol_put(vma_policy(vma));
+	kmem_cache_free(vm_area_cachep, vma);
+}
+
+#ifdef CONFIG_SPF
+void put_vma(struct vm_area_struct *vma)
+{
+	if (atomic_dec_and_test(&vma->vm_ref_count))
+		__free_vma(vma);
+}
+#else
+static inline void put_vma(struct vm_area_struct *vma)
+{
+	return __free_vma(vma);
+}
+#endif
+
 /*
  * Close a vm structure and free it, returning the next.
  */
@@ -170,10 +191,7 @@  static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
 	might_sleep();
 	if (vma->vm_ops && vma->vm_ops->close)
 		vma->vm_ops->close(vma);
-	if (vma->vm_file)
-		fput(vma->vm_file);
-	mpol_put(vma_policy(vma));
-	kmem_cache_free(vm_area_cachep, vma);
+	put_vma(vma);
 	return next;
 }
 
@@ -411,26 +429,41 @@  static void vma_gap_update(struct vm_area_struct *vma)
 }
 
 static inline void vma_rb_insert(struct vm_area_struct *vma,
-				 struct rb_root *root)
+				 struct mm_struct *mm)
 {
+	struct rb_root *root = &mm->mm_rb;
+
 	/* All rb_subtree_gap values must be consistent prior to insertion */
 	validate_mm_rb(root, NULL);
 
 	rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
 }
 
-static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
+static void __vma_rb_erase(struct vm_area_struct *vma, struct mm_struct *mm)
 {
+	struct rb_root *root = &mm->mm_rb;
 	/*
 	 * Note rb_erase_augmented is a fairly large inline function,
 	 * so make sure we instantiate it only once with our desired
 	 * augmented rbtree callbacks.
 	 */
+#ifdef CONFIG_SPF
+	write_lock(&mm->mm_rb_lock);
+#endif
 	rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
+#ifdef CONFIG_SPF
+	write_unlock(&mm->mm_rb_lock); /* wmb */
+#endif
+
+	/*
+	 * Ensure the removal is complete before clearing the node.
+	 * Matched by vma_has_changed()/handle_speculative_fault().
+	 */
+	RB_CLEAR_NODE(&vma->vm_rb);
 }
 
 static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
-						struct rb_root *root,
+						struct mm_struct *mm,
 						struct vm_area_struct *ignore)
 {
 	/*
@@ -438,21 +471,21 @@  static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
 	 * with the possible exception of the "next" vma being erased if
 	 * next->vm_start was reduced.
 	 */
-	validate_mm_rb(root, ignore);
+	validate_mm_rb(&mm->mm_rb, ignore);
 
-	__vma_rb_erase(vma, root);
+	__vma_rb_erase(vma, mm);
 }
 
 static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
-					 struct rb_root *root)
+					 struct mm_struct *mm)
 {
 	/*
 	 * All rb_subtree_gap values must be consistent prior to erase,
 	 * with the possible exception of the vma being erased.
 	 */
-	validate_mm_rb(root, vma);
+	validate_mm_rb(&mm->mm_rb, vma);
 
-	__vma_rb_erase(vma, root);
+	__vma_rb_erase(vma, mm);
 }
 
 /*
@@ -558,10 +591,6 @@  void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
 	else
 		mm->highest_vm_end = vm_end_gap(vma);
 
-#ifdef CONFIG_SPF
-	seqcount_init(&vma->vm_sequence);
-#endif
-
 	/*
 	 * vma->vm_prev wasn't known when we followed the rbtree to find the
 	 * correct insertion point for that vma. As a result, we could not
@@ -571,10 +600,17 @@  void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * immediately update the gap to the correct value. Finally we
 	 * rebalance the rbtree after all augmented values have been set.
 	 */
+#ifdef CONFIG_SPF
+	atomic_set(&vma->vm_ref_count, 1);
+	write_lock(&mm->mm_rb_lock);
+#endif
 	rb_link_node(&vma->vm_rb, rb_parent, rb_link);
 	vma->rb_subtree_gap = 0;
 	vma_gap_update(vma);
-	vma_rb_insert(vma, &mm->mm_rb);
+	vma_rb_insert(vma, mm);
+#ifdef CONFIG_SPF
+	write_unlock(&mm->mm_rb_lock);
+#endif
 }
 
 static void __vma_link_file(struct vm_area_struct *vma)
@@ -650,7 +686,7 @@  static __always_inline void __vma_unlink_common(struct mm_struct *mm,
 {
 	struct vm_area_struct *next;
 
-	vma_rb_erase_ignore(vma, &mm->mm_rb, ignore);
+	vma_rb_erase_ignore(vma, mm, ignore);
 	next = vma->vm_next;
 	if (has_prev)
 		prev->vm_next = next;
@@ -923,16 +959,13 @@  int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
 	}
 
 	if (remove_next) {
-		if (file) {
+		if (file)
 			uprobe_munmap(next, next->vm_start, next->vm_end);
-			fput(file);
-		}
 		if (next->anon_vma)
 			anon_vma_merge(vma, next);
 		mm->map_count--;
-		mpol_put(vma_policy(next));
 		vm_raw_write_end(next);
-		kmem_cache_free(vm_area_cachep, next);
+		put_vma(next);
 		/*
 		 * In mprotect's case 6 (see comments on vma_merge),
 		 * we must remove another next too. It would clutter
@@ -2182,15 +2215,11 @@  get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 EXPORT_SYMBOL(get_unmapped_area);
 
 /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
-struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+static struct vm_area_struct *__find_vma(struct mm_struct *mm,
+					 unsigned long addr)
 {
 	struct rb_node *rb_node;
-	struct vm_area_struct *vma;
-
-	/* Check the cache first. */
-	vma = vmacache_find(mm, addr);
-	if (likely(vma))
-		return vma;
+	struct vm_area_struct *vma = NULL;
 
 	rb_node = mm->mm_rb.rb_node;
 
@@ -2208,13 +2237,40 @@  struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
 			rb_node = rb_node->rb_right;
 	}
 
+	return vma;
+}
+
+struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+{
+	struct vm_area_struct *vma;
+
+	/* Check the cache first. */
+	vma = vmacache_find(mm, addr);
+	if (likely(vma))
+		return vma;
+
+	vma = __find_vma(mm, addr);
 	if (vma)
 		vmacache_update(addr, vma);
 	return vma;
 }
-
 EXPORT_SYMBOL(find_vma);
 
+#ifdef CONFIG_SPF
+struct vm_area_struct *get_vma(struct mm_struct *mm, unsigned long addr)
+{
+	struct vm_area_struct *vma = NULL;
+
+	read_lock(&mm->mm_rb_lock);
+	vma = __find_vma(mm, addr);
+	if (vma)
+		atomic_inc(&vma->vm_ref_count);
+	read_unlock(&mm->mm_rb_lock);
+
+	return vma;
+}
+#endif
+
 /*
  * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
  */
@@ -2582,7 +2638,7 @@  detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
 	insertion_point = (prev ? &prev->vm_next : &mm->mmap);
 	vma->vm_prev = NULL;
 	do {
-		vma_rb_erase(vma, &mm->mm_rb);
+		vma_rb_erase(vma, mm);
 		mm->map_count--;
 		tail_vma = vma;
 		vma = vma->vm_next;