
[04/17] mm: gup: make get_user_pages_fast and __get_user_pages_fast latency conscious

Message ID 1412356087-16115-5-git-send-email-aarcange@redhat.com
State New

Commit Message

Andrea Arcangeli Oct. 3, 2014, 5:07 p.m. UTC
This teaches gup_fast and __gup_fast to re-enable irqs and to
cond_resched(), if possible, every BATCH_PAGES pages. With 4KiB pages,
BATCH_PAGES = 512 bounds each irq-disabled critical section to a walk
of 2MiB of virtual address space.

Other archs will have to implement the same logic as well, and it's a
prerequisite before converting more get_user_pages() callers to
get_user_pages_fast() as an optimization (instead of using
get_user_pages_unlocked(), which would be slower).

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
---
 arch/x86/mm/gup.c | 234 ++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 149 insertions(+), 85 deletions(-)

Comments

Andrea Arcangeli Oct. 6, 2014, 2:14 p.m. UTC | #1
Hello,

On Fri, Oct 03, 2014 at 11:23:53AM -0700, Linus Torvalds wrote:
> On Fri, Oct 3, 2014 at 10:07 AM, Andrea Arcangeli <aarcange@redhat.com> wrote:
> > This teaches gup_fast and __gup_fast to re-enable irqs and
> > cond_resched() if possible every BATCH_PAGES.
> 
> This is disgusting.
> 
> Many (most?) __gup_fast() users just want a single page, and the
> stupid overhead of the multi-page version is already unnecessary.
> This just makes things much worse.
> 
> Quite frankly, we should make a single-page version of __gup_fast(),
> and convert existing users to use that. After that, the few multi-page
> users could have this extra latency control stuff.

Ok. I couldn't think of a better way to add the latency control other
than to reduce nr_pages in an outer loop instead of altering the inner
calls, and this is what I got after implementing it... If somebody has
a cleaner way to implement the latency control stuff, that's welcome
and I'd be glad to replace this one.
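
For illustration only, a minimal sketch of the kind of single-page
variant suggested above could look like this (the function name is
hypothetical and this is untested; it reuses the existing
gup_pud_range() walker and skips all the batching machinery, so irqs
stay disabled only for a single-address walk):

static int __get_user_page_fast(unsigned long addr, int write,
				struct page **page)
{
	struct mm_struct *mm = current->mm;
	unsigned long flags;
	pgd_t pgd;
	int nr = 0;

	addr &= PAGE_MASK;
	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
				(void __user *)addr, PAGE_SIZE)))
		return 0;

	/* One pgd/pud/pmd/pte lookup: no latency control needed. */
	local_irq_save(flags);
	pgd = *pgd_offset(mm, addr);
	if (!pgd_none(pgd))
		gup_pud_range(pgd, addr, addr + PAGE_SIZE, write,
			      page, &nr);
	local_irq_restore(flags);

	return nr;	/* 0 or 1 */
}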

> And yes, the single-page version of get_user_pages_fast() is actually
> latency-critical. shared futexes hit it hard, and yes, I've seen this
> in profiles.

KVM would save a few cycles from a single-page version too. I just
thought further optimizations could be added later and that this was
better than nothing.

Considering I have no better idea how to implement the latency control
stuff, for now I'll just drop this controversial patch and convert
those get_user_pages() calls to gup_unlocked instead of gup_fast. That
is more than enough to obtain the mmap_sem holding-time scalability
improvement (which also solves the mmap_sem trouble for the
userfaultfd). gup_unlocked isn't as good as gup_fast, but it's at
least better than the current get_user_pages().
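
To make the intended conversion concrete, a sketch (the caller and
variables are hypothetical; the get_user_pages_unlocked() prototype is
the one used by the slow path in the patch below):

	/* Before: the caller holds mmap_sem across the whole call. */
	down_read(&mm->mmap_sem);
	ret = get_user_pages(current, mm, start, nr_pages, write, 0,
			     pages, NULL);
	up_read(&mm->mmap_sem);

	/*
	 * After: mmap_sem is taken internally (and can be dropped
	 * while faulting pages in), shortening the hold times.
	 */
	ret = get_user_pages_unlocked(current, mm, start, nr_pages,
				      write, 0, pages);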

I got into this gup_fast latency control stuff purely because there
were a few get_user_pages() calls that could have been converted to
get_user_pages_fast(), as they were passing "current" and
"current->mm" as the first two parameters, except for the risk of
keeping irqs disabled for too long. So I tried to do the right thing
and fix gup_fast, but I'll leave this further optimization queued for
later.
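
The pattern in question, again only as a hypothetical sketch:

	/* Only callers operating on current->mm qualify... */
	ret = get_user_pages(current, current->mm, addr, 1, write, 0,
			     &page, NULL);
	/* ...and could become, with no mmap_sem taken in the caller: */
	ret = get_user_pages_fast(addr, 1, write, &page);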

About the missing commit header for the other patch, Paolo already
replied to it. To clarify this a bit further: in short, I expect the
FOLL_TRIED flag to be merged through the KVM git tree, which already
contains it. I'll add a comment to the commit header to specify
that. Sorry for the confusion about that patch.

Thanks,
Andrea

Patch

diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 2ab183b..917d8c1 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -12,6 +12,12 @@ 
 
 #include <asm/pgtable.h>
 
+/*
+ * Keep irq disabled for no more than BATCH_PAGES pages.
+ * Matches PTRS_PER_PTE (or half in non-PAE kernels).
+ */
+#define BATCH_PAGES	512
+
 static inline pte_t gup_get_pte(pte_t *ptep)
 {
 #ifndef CONFIG_X86_PAE
@@ -250,6 +256,40 @@  static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
 	return 1;
 }
 
+static inline int __get_user_pages_fast_batch(unsigned long start,
+					      unsigned long end,
+					      int write, struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long next;
+	unsigned long flags;
+	pgd_t *pgdp;
+	int nr = 0;
+
+	/*
+	 * This doesn't prevent pagetable teardown, but does prevent
+	 * the pagetables and pages from being freed on x86.
+	 *
+	 * So long as we atomically load page table pointers versus teardown
+	 * (which we do on x86, with the above PAE exception), we can follow the
+	 * address down to the the page and take a ref on it.
+	 */
+	local_irq_save(flags);
+	pgdp = pgd_offset(mm, start);
+	do {
+		pgd_t pgd = *pgdp;
+
+		next = pgd_addr_end(start, end);
+		if (pgd_none(pgd))
+			break;
+		if (!gup_pud_range(pgd, start, next, write, pages, &nr))
+			break;
+	} while (pgdp++, start = next, start != end);
+	local_irq_restore(flags);
+
+	return nr;
+}
+
 /*
  * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
  * back to the regular GUP.
@@ -257,31 +297,55 @@  static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
 int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
 			  struct page **pages)
 {
-	struct mm_struct *mm = current->mm;
-	unsigned long addr, len, end;
-	unsigned long next;
-	unsigned long flags;
-	pgd_t *pgdp;
-	int nr = 0;
+	unsigned long len, end, batch_pages;
+	int nr, ret;
 
 	start &= PAGE_MASK;
-	addr = start;
 	len = (unsigned long) nr_pages << PAGE_SHIFT;
 	end = start + len;
+	/*
+	 * get_user_pages() handles nr_pages == 0 gracefully, but
+	 * gup_fast starts walking the first pagetable in a do {}
+	 * while() fashion so it's not robust to handle nr_pages ==
+	 * 0. There's no point in being permissive about end < start
+	 * either. So this check verifies both nr_pages being non
+	 * zero, and that "end" didn't overflow.
+	 */
+	VM_BUG_ON(end <= start);
 	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
 					(void __user *)start, len)))
 		return 0;
 
-	/*
-	 * XXX: batch / limit 'nr', to avoid large irq off latency
-	 * needs some instrumenting to determine the common sizes used by
-	 * important workloads (eg. DB2), and whether limiting the batch size
-	 * will decrease performance.
-	 *
-	 * It seems like we're in the clear for the moment. Direct-IO is
-	 * the main guy that batches up lots of get_user_pages, and even
-	 * they are limited to 64-at-a-time which is not so many.
-	 */
+	ret = 0;
+	for (;;) {
+		batch_pages = nr_pages;
+		if (batch_pages > BATCH_PAGES && !irqs_disabled())
+			batch_pages = BATCH_PAGES;
+		len = (unsigned long) batch_pages << PAGE_SHIFT;
+		end = start + len;
+		nr = __get_user_pages_fast_batch(start, end, write, pages);
+		VM_BUG_ON(nr > batch_pages);
+		nr_pages -= nr;
+		ret += nr;
+		if (!nr_pages || nr != batch_pages)
+			break;
+		start += len;
+		pages += batch_pages;
+	}
+
+	return ret;
+}
+
+static inline int get_user_pages_fast_batch(unsigned long start,
+					    unsigned long end,
+					    int write, struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long next;
+	pgd_t *pgdp;
+	int nr = 0;
+	unsigned long orig_start = start;
+
 	/*
 	 * This doesn't prevent pagetable teardown, but does prevent
 	 * the pagetables and pages from being freed on x86.
@@ -290,18 +354,24 @@  int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
 	 * (which we do on x86, with the above PAE exception), we can follow the
 	 * address down to the the page and take a ref on it.
 	 */
-	local_irq_save(flags);
-	pgdp = pgd_offset(mm, addr);
+	local_irq_disable();
+	pgdp = pgd_offset(mm, start);
 	do {
 		pgd_t pgd = *pgdp;
 
-		next = pgd_addr_end(addr, end);
-		if (pgd_none(pgd))
+		next = pgd_addr_end(start, end);
+		if (pgd_none(pgd)) {
+			VM_BUG_ON(nr >= (end-orig_start) >> PAGE_SHIFT);
 			break;
-		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+		}
+		if (!gup_pud_range(pgd, start, next, write, pages, &nr)) {
+			VM_BUG_ON(nr >= (end-orig_start) >> PAGE_SHIFT);
 			break;
-	} while (pgdp++, addr = next, addr != end);
-	local_irq_restore(flags);
+		}
+	} while (pgdp++, start = next, start != end);
+	local_irq_enable();
+
+	cond_resched();
 
 	return nr;
 }
@@ -326,80 +396,74 @@  int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 			struct page **pages)
 {
 	struct mm_struct *mm = current->mm;
-	unsigned long addr, len, end;
-	unsigned long next;
-	pgd_t *pgdp;
-	int nr = 0;
+	unsigned long len, end, batch_pages;
+	int nr, ret;
+	unsigned long orig_start;
 
 	start &= PAGE_MASK;
-	addr = start;
+	orig_start = start;
 	len = (unsigned long) nr_pages << PAGE_SHIFT;
 
 	end = start + len;
-	if (end < start)
-		goto slow_irqon;
+	/*
+	 * get_user_pages() handles nr_pages == 0 gracefully, but
+	 * gup_fast starts walking the first pagetable in a do {}
+	 * while() fashion so it's not robust to handle nr_pages ==
+	 * 0. There's no point in being permissive about end < start
+	 * either. So this check verifies both nr_pages being non
+	 * zero, and that "end" didn't overflow.
+	 */
+	VM_BUG_ON(end <= start);
 
+	nr = ret = 0;
 #ifdef CONFIG_X86_64
 	if (end >> __VIRTUAL_MASK_SHIFT)
 		goto slow_irqon;
 #endif
+	for (;;) {
+		batch_pages = min(nr_pages, BATCH_PAGES);
+		len = (unsigned long) batch_pages << PAGE_SHIFT;
+		end = start + len;
+		nr = get_user_pages_fast_batch(start, end, write, pages);
+		VM_BUG_ON(nr > batch_pages);
+		nr_pages -= nr;
+		ret += nr;
+		if (!nr_pages)
+			break;
+		if (nr < batch_pages)
+			goto slow_irqon;
+		start += len;
+		pages += batch_pages;
+	}
 
-	/*
-	 * XXX: batch / limit 'nr', to avoid large irq off latency
-	 * needs some instrumenting to determine the common sizes used by
-	 * important workloads (eg. DB2), and whether limiting the batch size
-	 * will decrease performance.
-	 *
-	 * It seems like we're in the clear for the moment. Direct-IO is
-	 * the main guy that batches up lots of get_user_pages, and even
-	 * they are limited to 64-at-a-time which is not so many.
-	 */
-	/*
-	 * This doesn't prevent pagetable teardown, but does prevent
-	 * the pagetables and pages from being freed on x86.
-	 *
-	 * So long as we atomically load page table pointers versus teardown
-	 * (which we do on x86, with the above PAE exception), we can follow the
-	 * address down to the the page and take a ref on it.
-	 */
-	local_irq_disable();
-	pgdp = pgd_offset(mm, addr);
-	do {
-		pgd_t pgd = *pgdp;
-
-		next = pgd_addr_end(addr, end);
-		if (pgd_none(pgd))
-			goto slow;
-		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
-			goto slow;
-	} while (pgdp++, addr = next, addr != end);
-	local_irq_enable();
-
-	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
-	return nr;
-
-	{
-		int ret;
+	VM_BUG_ON(ret != (end - orig_start) >> PAGE_SHIFT);
+	return ret;
 
-slow:
-		local_irq_enable();
 slow_irqon:
-		/* Try to get the remaining pages with get_user_pages */
-		start += nr << PAGE_SHIFT;
-		pages += nr;
-
-		ret = get_user_pages_unlocked(current, mm, start,
-					      (end - start) >> PAGE_SHIFT,
-					      write, 0, pages);
-
-		/* Have to be a bit careful with return values */
-		if (nr > 0) {
-			if (ret < 0)
-				ret = nr;
-			else
-				ret += nr;
-		}
+	/* Try to get the remaining pages with get_user_pages */
+	start += nr << PAGE_SHIFT;
+	pages += nr;
 
-		return ret;
+	/*
+	 * "nr" was the get_user_pages_fast_batch last retval, "ret"
+	 * was the sum of all get_user_pages_fast_batch retvals, now
+	 * "nr" becomes the sum of all get_user_pages_fast_batch
+	 * retvals and "ret" will become the get_user_pages_unlocked
+	 * retval.
+	 */
+	nr = ret;
+
+	ret = get_user_pages_unlocked(current, mm, start,
+				      (end - start) >> PAGE_SHIFT,
+				      write, 0, pages);
+
+	/* Have to be a bit careful with return values */
+	if (nr > 0) {
+		if (ret < 0)
+			ret = nr;
+		else
+			ret += nr;
 	}
+
+	return ret;
 }
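
A note on the return-value handling at the end, worked through from
the code above: if the fast batches pinned 3 pages before falling back
(so nr = 3) and get_user_pages_unlocked() then fails with -EFAULT, the
function still returns 3; if the slow path instead pins 5 more pages,
it returns 8. A negative error is only returned when the fast path
pinned nothing at all.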