[v2] sparc64: Reduce TLB flushes during hugepage unmap

Message ID 1454540423-67071-1-git-send-email-nitin.m.gupta@oracle.com
State Changes Requested
Delegated to: David Miller
Headers show

Commit Message

Nitin Gupta Feb. 3, 2016, 11 p.m. UTC
During hugepage unmap, TSB and TLB flushes are currently
issued at every PAGE_SIZE'd boundary, which is unnecessary.
We now issue the flush at REAL_HPAGE_SIZE boundaries only.

Without this patch, workloads which unmap a large
hugepage-backed VMA region get CPU lockups due to
excessive TLB flush calls.

Signed-off-by: Nitin Gupta <nitin.m.gupta@oracle.com>
---
 arch/sparc/include/asm/tlbflush_64.h |  3 ++-
 arch/sparc/mm/hugetlbpage.c          |  7 ++++++-
 arch/sparc/mm/tlb.c                  |  2 +-
 arch/sparc/mm/tsb.c                  | 21 ++++++++++++++-------
 4 files changed, 23 insertions(+), 10 deletions(-)

Changelog v1 vs v2:
 - Access PTEs in order (David Miller)
 - Issue TLB and TSB flush after clearing PTEs (David Miller)
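
For a rough sense of the reduction, the flush-call arithmetic can be
checked with a small standalone program; the 8K base page, 8M HPAGE_SIZE
and 4M REAL_HPAGE_SIZE figures below are assumed sparc64 values used for
illustration, not numbers taken from the patch:

#include <stdio.h>

/* Assumed sparc64 geometry: 8K base pages, 8M hugepages built from
 * two 4M (REAL_HPAGE_SIZE) hardware mappings.
 */
#define BASE_PAGE_SIZE   (8ULL << 10)
#define HPAGE_SIZE       (8ULL << 20)
#define REAL_HPAGE_SIZE  (4ULL << 20)

int main(void)
{
	unsigned long long vma = 512ULL * HPAGE_SIZE;	/* a 4G hugepage-backed VMA */

	printf("flush points per hugepage: old %llu, new %llu\n",
	       HPAGE_SIZE / BASE_PAGE_SIZE, HPAGE_SIZE / REAL_HPAGE_SIZE);
	printf("flush points for a %llu MB VMA: old %llu, new %llu\n",
	       vma >> 20, vma / BASE_PAGE_SIZE, vma / REAL_HPAGE_SIZE);
	return 0;
}

With these assumed sizes, each unmapped hugepage goes from 1024 flush
points down to 2.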

Comments

Nitin Gupta Feb. 9, 2016, 6:33 p.m. UTC | #1
Gentle reminder for review comments.
Also adding Naoya to CC, whom I forgot last time; sorry.

Thanks,
Nitin


On 02/03/2016 03:00 PM, Nitin Gupta wrote:
> During hugepage unmap, TSB and TLB flushes are currently
> issued at every PAGE_SIZE'd boundary which is unnecessary.
> We now issue the flush at REAL_HPAGE_SIZE boundaries only.
> 
> Without this patch workloads which unmap a large hugepage
> backed VMA region get CPU lockups due to excessive TLB
> flush calls.
> 
> Signed-off-by: Nitin Gupta <nitin.m.gupta@oracle.com>
> ---
>  arch/sparc/include/asm/tlbflush_64.h |  3 ++-
>  arch/sparc/mm/hugetlbpage.c          |  7 ++++++-
>  arch/sparc/mm/tlb.c                  |  2 +-
>  arch/sparc/mm/tsb.c                  | 21 ++++++++++++++-------
>  4 files changed, 23 insertions(+), 10 deletions(-)
> 
> Changelog v1 vs v2:
>  - Access PTEs in order (David Miller)
>  - Issue TLB and TSB flush after clearing PTEs (David Miller)
>  
> diff --git a/arch/sparc/include/asm/tlbflush_64.h b/arch/sparc/include/asm/tlbflush_64.h
> index dea1cfa..5c28f78 100644
> --- a/arch/sparc/include/asm/tlbflush_64.h
> +++ b/arch/sparc/include/asm/tlbflush_64.h
> @@ -16,7 +16,8 @@ struct tlb_batch {
>  
>  void flush_tsb_kernel_range(unsigned long start, unsigned long end);
>  void flush_tsb_user(struct tlb_batch *tb);
> -void flush_tsb_user_page(struct mm_struct *mm, unsigned long vaddr);
> +void flush_tsb_user_page(struct mm_struct *mm, unsigned long vaddr,
> +			 bool is_huge);
>  
>  /* TLB flush operations. */
>  
> diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
> index 131eaf4..d8f625c 100644
> --- a/arch/sparc/mm/hugetlbpage.c
> +++ b/arch/sparc/mm/hugetlbpage.c
> @@ -202,10 +202,15 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
>  	addr &= HPAGE_MASK;
>  
>  	for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
> -		pte_clear(mm, addr, ptep);
> +		*ptep = __pte(0UL);
>  		addr += PAGE_SIZE;
>  		ptep++;
>  	}
> +	/* Issue TSB and TLB flush at REAL_HPAGE_SIZE boundaries */
> +	flush_tsb_user_page(mm, addr, true);
> +	flush_tsb_user_page(mm, addr + REAL_HPAGE_SIZE, true);
> +	global_flush_tlb_page(mm, addr);
> +	global_flush_tlb_page(mm, addr + REAL_HPAGE_SIZE);
>  
>  	return entry;
>  }
> diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c
> index 9df2190..1a5de57 100644
> --- a/arch/sparc/mm/tlb.c
> +++ b/arch/sparc/mm/tlb.c
> @@ -84,7 +84,7 @@ static void tlb_batch_add_one(struct mm_struct *mm, unsigned long vaddr,
>  	}
>  
>  	if (!tb->active) {
> -		flush_tsb_user_page(mm, vaddr);
> +		flush_tsb_user_page(mm, vaddr, false);
>  		global_flush_tlb_page(mm, vaddr);
>  		goto out;
>  	}
> diff --git a/arch/sparc/mm/tsb.c b/arch/sparc/mm/tsb.c
> index a065766..3af2848 100644
> --- a/arch/sparc/mm/tsb.c
> +++ b/arch/sparc/mm/tsb.c
> @@ -94,18 +94,25 @@ void flush_tsb_user(struct tlb_batch *tb)
>  	spin_unlock_irqrestore(&mm->context.lock, flags);
>  }
>  
> -void flush_tsb_user_page(struct mm_struct *mm, unsigned long vaddr)
> +/*
> + * @is_huge is true if the page containing @vaddr is guaranteed to
> + * be a huge page. If false, then TSBs for both base and huge page
> + * sizes are flushed.
> + */
> +void flush_tsb_user_page(struct mm_struct *mm, unsigned long vaddr,
> +			 bool is_huge)
>  {
>  	unsigned long nentries, base, flags;
>  
>  	spin_lock_irqsave(&mm->context.lock, flags);
>  
> -	base = (unsigned long) mm->context.tsb_block[MM_TSB_BASE].tsb;
> -	nentries = mm->context.tsb_block[MM_TSB_BASE].tsb_nentries;
> -	if (tlb_type == cheetah_plus || tlb_type == hypervisor)
> -		base = __pa(base);
> -	__flush_tsb_one_entry(base, vaddr, PAGE_SHIFT, nentries);
> -
> +	if (!is_huge) {
> +		base = (unsigned long) mm->context.tsb_block[MM_TSB_BASE].tsb;
> +		nentries = mm->context.tsb_block[MM_TSB_BASE].tsb_nentries;
> +		if (tlb_type == cheetah_plus || tlb_type == hypervisor)
> +			base = __pa(base);
> +		__flush_tsb_one_entry(base, vaddr, PAGE_SHIFT, nentries);
> +	}
>  #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
>  	if (mm->context.tsb_block[MM_TSB_HUGE].tsb) {
>  		base = (unsigned long) mm->context.tsb_block[MM_TSB_HUGE].tsb;
> 
David Miller Feb. 9, 2016, 10:12 p.m. UTC | #2
From: Nitin Gupta <nitin.m.gupta@oracle.com>
Date: Tue, 9 Feb 2016 10:33:42 -0800

> Gentle reminder for review comments.
> Also adding Naoya to CC whom I forgot last time, sorry.

I'm travelling, conferencing, and giving a keynote on networking.

I will therefore probably not be able to fully deal with this for at
least another week, sorry.

If a patch is in patchwork, I know about it, it's on my TODO list,
and you never need to send me a reminder.

Thanks.
David Miller March 21, 2016, 4:28 a.m. UTC | #3
From: Nitin Gupta <nitin.m.gupta@oracle.com>
Date: Wed,  3 Feb 2016 15:00:23 -0800

> During hugepage unmap, TSB and TLB flushes are currently
> issued at every PAGE_SIZE'd boundary which is unnecessary.
> We now issue the flush at REAL_HPAGE_SIZE boundaries only.
> 
> Without this patch workloads which unmap a large hugepage
> backed VMA region get CPU lockups due to excessive TLB
> flush calls.
> 
> Signed-off-by: Nitin Gupta <nitin.m.gupta@oracle.com>

I thought a lot about this stuff tonight, and I think we need to be
more intelligent about this.

Doing a synchronous flush unconditionally is not good.  In particular,
we aren't even checking if the original PTE was mapped or not which
is going to be the most common case when a new mapping is created.

Also, we can't skip the D-cache flushes that older cpus need, as done
by tlb_batch_add().

Therefore let's teach the TLB batcher what we're actually trying to do
and what the optimization is, instead of trying so hard to bypass it
altogether.

In asm/pgtable_64.h provide is_hugetlb_pte(), I'd implement it like
this:

static inline unsigned long __pte_huge_mask(void)
{
	unsigned long mask;

	__asm__ __volatile__(
	"\n661:	sethi		%%uhi(%1), %0\n"
	"	sllx		%0, 32, %0\n"
	"	.section	.sun4v_2insn_patch, \"ax\"\n"
	"	.word		661b\n"
	"	mov		%2, %0\n"
	"	nop\n"
	"	.previous\n"
	: "=r" (mask)
	: "i" (_PAGE_SZHUGE_4U), "i" (_PAGE_SZHUGE_4V));

	return mask;
}

Then pte_mkhuge() becomes:

static inline pte_t pte_mkhuge(pte_t pte)
{
	return __pte(pte_val(pte) | __pte_huge_mask());
}

and then:

static inline bool is_hugetlb_pte(pte_t pte)
{
	return (pte_val(pte) & __pte_huge_mask());
}

And then tlb_batch_add() can detect if the original PTE is huge:

	bool huge = is_hugetlb_pte(orig);

and then the end of the function is:

	if (huge && (vaddr & (REAL_HPAGE_SIZE - 1)))
		return;
	if (!fullmm)
		tlb_batch_add_one(mm, vaddr, pte_exec(orig), huge);

and tlb_batch_add_one() takes 'huge' and uses it to drive the flushing.

For a synchronous flush, we pass it down to flush_tsb_user_page().

For a batched flush we store it in tlb_batch, and any time 'huge'
changes, we do a flush_tlb_pending(), just the same as if tb->tlb_nr
hit TLB_BATCH_NR.

Then flush_tsb_user() uses 'tb->huge' to decide whether to flush
MM_TSB_BASE or not.
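
For concreteness, here is a sketch of how the batching side could look;
the 'huge' member added to struct tlb_batch and the exact placement of
the checks inside tlb_batch_add_one() are assumptions drawn from the
description above, not code from this thread:

static void tlb_batch_add_one(struct mm_struct *mm, unsigned long vaddr,
			      bool exec, bool huge)
{
	struct tlb_batch *tb = &get_cpu_var(tlb_batch);
	unsigned long nr;

	vaddr &= PAGE_MASK;
	if (exec)
		vaddr |= 0x1UL;

	nr = tb->tlb_nr;
	if (unlikely(nr != 0 && mm != tb->mm)) {
		flush_tlb_pending();
		nr = 0;
	}

	if (!tb->active) {
		/* Synchronous case: pass 'huge' straight down. */
		flush_tsb_user_page(mm, vaddr, huge);
		global_flush_tlb_page(mm, vaddr);
		goto out;
	}

	if (nr == 0) {
		tb->mm = mm;
		tb->huge = huge;	/* assumed new field in struct tlb_batch */
	}

	/* Flush the pending batch whenever the page-size class changes. */
	if (tb->huge != huge) {
		flush_tlb_pending();
		tb->huge = huge;
		nr = 0;
	}

	tb->vaddrs[nr] = vaddr;
	tb->tlb_nr = ++nr;
	if (nr >= TLB_BATCH_NR)
		flush_tlb_pending();

out:
	put_cpu_var(tlb_batch);
}

flush_tsb_user() would then test tb->huge and skip the MM_TSB_BASE
flush when it is set, mirroring the is_huge check this patch already
adds to flush_tsb_user_page().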
Nitin Gupta March 22, 2016, 1:47 a.m. UTC | #4
On 03/20/2016 09:28 PM, David Miller wrote:
> From: Nitin Gupta <nitin.m.gupta@oracle.com>
> Date: Wed,  3 Feb 2016 15:00:23 -0800
> 
>> During hugepage unmap, TSB and TLB flushes are currently
>> issued at every PAGE_SIZE'd boundary which is unnecessary.
>> We now issue the flush at REAL_HPAGE_SIZE boundaries only.
>>
>> Without this patch workloads which unmap a large hugepage
>> backed VMA region get CPU lockups due to excessive TLB
>> flush calls.
>>
>> Signed-off-by: Nitin Gupta <nitin.m.gupta@oracle.com>
> 
> I thought a lot about this stuff tonight, and I think we need to be
> more intelligent about this.
> 
> Doing a synchronous flush unconditionally is not good.  In particular,
> we aren't even checking if the original PTE was mapped or not which
> is going to be the most common case when a new mapping is created.
> 
> Also, we can't skip the D-cache flushes that older cpus need, as done
> by tlb_batch_add().
> 
> Therefore let's teach the TLB batcher what we're actually trying to do
> and what the optimization is, instead of trying so hard to bypass it
> altogether.
> 
> In asm/pgtable_64.h provide is_hugetlb_pte(), I'd implement it like
> this:
> 
> static inline unsigned long __pte_huge_mask(void)
> {
> 	unsigned long mask;
> 
> 	__asm__ __volatile__(
> 	"\n661:	sethi		%%uhi(%1), %0\n"
> 	"	sllx		%0, 32, %0\n"
> 	"	.section	.sun4v_2insn_patch, \"ax\"\n"
> 	"	.word		661b\n"
> 	"	mov		%2, %0\n"
> 	"	nop\n"
> 	"	.previous\n"
> 	: "=r" (mask)
> 	: "i" (_PAGE_SZHUGE_4U), "i" (_PAGE_SZHUGE_4V));
> 
> 	return mask;
> }
> 
> Then pte_mkhuge() becomes:
> 
> static inline pte_t pte_mkhuge(pte_t pte)
> {
> 	return __pte(pte_val(pte) | __pte_huge_mask());
> }
> 
> and then:
> 
> static inline bool is_hugetlb_pte(pte_t pte)
> {
> 	return (pte_val(pte) & __pte_huge_mask());
> }
> 
> And then tlb_batch_add() can detect if the original PTE is huge:
> 
> 	bool huge = is_hugetlb_pte(orig);
> 
> and then the end of the function is:
> 
> 	if (huge && (vaddr & (REAL_HPAGE_SIZE - 1)))
> 		return;
> 	if (!fullmm)
> 		tlb_batch_add_one(mm, vaddr, pte_exec(orig), huge);
> 
> and tlb_batch_add_one() takes 'huge' and uses it to drive the flushing.
> 
> For a synchronous flush, we pass it down to flush_tsb_user_page().
> 
> For a batched flush we store it in tlb_batch, and any time 'huge'
> changes, we do a flush_tlb_pending(), just the same as if tb->tlb_nr
> hit TLB_BATCH_NR.
> 
> Then flush_tsb_user() uses 'tb->huge' to decide whether to flush
> MM_TSB_BASE or not.
> 

Thanks for the detailed notes.  I will now combine my last two
patches, which fix TLB flushing on PTE change and on PTE clearing
respectively, with the changes you suggested, and send a v2 soon.

Thanks,
Nitin

Nitin Gupta March 25, 2016, 11:55 p.m. UTC | #5
On 03/20/2016 09:28 PM, David Miller wrote:
> From: Nitin Gupta <nitin.m.gupta@oracle.com>
> Date: Wed,  3 Feb 2016 15:00:23 -0800
> 
>> During hugepage unmap, TSB and TLB flushes are currently
>> issued at every PAGE_SIZE'd boundary which is unnecessary.
>> We now issue the flush at REAL_HPAGE_SIZE boundaries only.
>>
>> Without this patch workloads which unmap a large hugepage
>> backed VMA region get CPU lockups due to excessive TLB
>> flush calls.
>>
>> Signed-off-by: Nitin Gupta <nitin.m.gupta@oracle.com>
> 
> I thought a lot about this stuff tonight, and I think we need to be
> more intelligent about this.
> 
> Doing a synchronous flush unconditionally is not good.  In particular,
> we aren't even checking if the original PTE was mapped or not which
> is going to be the most common case when a new mapping is created.
> 
> Also, we can't skip the D-cache flushes that older cpus need, as done
> by tlb_batch_add().
> 
> Therefore let's teach the TLB batcher what we're actually trying to do
> and what the optimization is, instead of trying so hard to bypass it
> altogether.
> 
> In asm/pgtable_64.h provide is_hugetlb_pte(), I'd implement it like
> this:
> 
> static inline unsigned long __pte_huge_mask(void)
> {
> 	unsigned long mask;
> 
> 	__asm__ __volatile__(
> 	"\n661:	sethi		%%uhi(%1), %0\n"
> 	"	sllx		%0, 32, %0\n"
> 	"	.section	.sun4v_2insn_patch, \"ax\"\n"
> 	"	.word		661b\n"
> 	"	mov		%2, %0\n"
> 	"	nop\n"
> 	"	.previous\n"
> 	: "=r" (mask)
> 	: "i" (_PAGE_SZHUGE_4U), "i" (_PAGE_SZHUGE_4V));
> 
> 	return mask;
> }
> 
> Then pte_mkhuge() becomes:
> 
> static inline pte_t pte_mkhuge(pte_t pte)
> {
> 	return __pte(pte_val(pte) | __pte_huge_mask());
> }
> 
> and then:
> 
> static inline bool is_hugetlb_pte(pte_t pte)
> {
> 	return (pte_val(pte) & __pte_huge_mask());
> }
> 
> And then tlb_batch_add() can detect if the original PTE is huge:
> 
> 	bool huge = is_hugetlb_pte(orig);
> 
> and then the end of the function is:
> 
> 	if (huge && (vaddr & (REAL_HPAGE_SIZE - 1)))
> 		return;

Wouldn't this cause a race:
 - the PTE for a REAL_HPAGE_SIZE-aligned vaddr is cleared
 - huge-TSB and TLB flushes are issued (or scheduled) for this vaddr
 - another thread accesses the PTE for vaddr + PAGE_SIZE
   - TSB and TLB entries are created again for the aligned vaddr and
     are then never flushed.

So I think the solution is to clear/set the hugetlb PTEs in the loop
by direct assignment and, after the loop, call set_pte_at() for the
base addr and for addr + REAL_HPAGE_SIZE, as sketched below.
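
A minimal sketch of that approach (it assumes the tlb_batch changes
described above so that set_pte_at() ends up driving the huge flushes;
this is only an illustration of the idea, not a posted patch):

pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep)
{
	unsigned long i, nptes = 1UL << HUGETLB_PAGE_ORDER;
	pte_t entry = *ptep;

	addr &= HPAGE_MASK;

	/* Clear everything except the first PTE of each 4M half directly. */
	for (i = 0; i < nptes; i++) {
		if (i != 0 && i != nptes / 2)
			ptep[i] = __pte(0UL);
	}

	/* Let the normal batching path see one huge PTE change per half. */
	set_pte_at(mm, addr, ptep, __pte(0UL));
	set_pte_at(mm, addr + REAL_HPAGE_SIZE, ptep + nptes / 2, __pte(0UL));

	return entry;
}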

This is ugly, but the problem would disappear once I'm done with the
page table trimming patch, which allocates page tables only down to
the PMD level for 8M hugepage-backed areas, just like THP. With that
patch we would have just one PMD to change, so the above race cannot
happen, and it would also save a lot of memory.

Thanks,
Nitin


> 	if (!fullmm)
> 		tlb_batch_add_one(mm, vaddr, pte_exec(orig), huge);
> 
> and tlb_batch_add_one() takes 'huge' and uses it to drive the flushing.
> 
> For a synchronous flush, we pass it down to flush_tsb_user_page().
> 
> For a batched flush we store it in tlb_batch, and any time 'huge'
> changes, we do a flush_tlb_pending(), just the same as if tb->tlb_nr
> hit TLB_BATCH_NR.
> 
> Then flush_tsb_user() uses 'tb->huge' to decide whether to flush
> MM_TSB_BASE or not.

Patch

diff --git a/arch/sparc/include/asm/tlbflush_64.h b/arch/sparc/include/asm/tlbflush_64.h
index dea1cfa..5c28f78 100644
--- a/arch/sparc/include/asm/tlbflush_64.h
+++ b/arch/sparc/include/asm/tlbflush_64.h
@@ -16,7 +16,8 @@  struct tlb_batch {
 
 void flush_tsb_kernel_range(unsigned long start, unsigned long end);
 void flush_tsb_user(struct tlb_batch *tb);
-void flush_tsb_user_page(struct mm_struct *mm, unsigned long vaddr);
+void flush_tsb_user_page(struct mm_struct *mm, unsigned long vaddr,
+			 bool is_huge);
 
 /* TLB flush operations. */
 
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index 131eaf4..d8f625c 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -202,10 +202,15 @@  pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
 	addr &= HPAGE_MASK;
 
 	for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
-		pte_clear(mm, addr, ptep);
+		*ptep = __pte(0UL);
 		addr += PAGE_SIZE;
 		ptep++;
 	}
+	/* Issue TSB and TLB flush at REAL_HPAGE_SIZE boundaries */
+	flush_tsb_user_page(mm, addr, true);
+	flush_tsb_user_page(mm, addr + REAL_HPAGE_SIZE, true);
+	global_flush_tlb_page(mm, addr);
+	global_flush_tlb_page(mm, addr + REAL_HPAGE_SIZE);
 
 	return entry;
 }
diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c
index 9df2190..1a5de57 100644
--- a/arch/sparc/mm/tlb.c
+++ b/arch/sparc/mm/tlb.c
@@ -84,7 +84,7 @@  static void tlb_batch_add_one(struct mm_struct *mm, unsigned long vaddr,
 	}
 
 	if (!tb->active) {
-		flush_tsb_user_page(mm, vaddr);
+		flush_tsb_user_page(mm, vaddr, false);
 		global_flush_tlb_page(mm, vaddr);
 		goto out;
 	}
diff --git a/arch/sparc/mm/tsb.c b/arch/sparc/mm/tsb.c
index a065766..3af2848 100644
--- a/arch/sparc/mm/tsb.c
+++ b/arch/sparc/mm/tsb.c
@@ -94,18 +94,25 @@  void flush_tsb_user(struct tlb_batch *tb)
 	spin_unlock_irqrestore(&mm->context.lock, flags);
 }
 
-void flush_tsb_user_page(struct mm_struct *mm, unsigned long vaddr)
+/*
+ * @is_huge is true if the page containing @vaddr is guaranteed to
+ * be a huge page. If false, then TSBs for both base and huge page
+ * sizes are flushed.
+ */
+void flush_tsb_user_page(struct mm_struct *mm, unsigned long vaddr,
+			 bool is_huge)
 {
 	unsigned long nentries, base, flags;
 
 	spin_lock_irqsave(&mm->context.lock, flags);
 
-	base = (unsigned long) mm->context.tsb_block[MM_TSB_BASE].tsb;
-	nentries = mm->context.tsb_block[MM_TSB_BASE].tsb_nentries;
-	if (tlb_type == cheetah_plus || tlb_type == hypervisor)
-		base = __pa(base);
-	__flush_tsb_one_entry(base, vaddr, PAGE_SHIFT, nentries);
-
+	if (!is_huge) {
+		base = (unsigned long) mm->context.tsb_block[MM_TSB_BASE].tsb;
+		nentries = mm->context.tsb_block[MM_TSB_BASE].tsb_nentries;
+		if (tlb_type == cheetah_plus || tlb_type == hypervisor)
+			base = __pa(base);
+		__flush_tsb_one_entry(base, vaddr, PAGE_SHIFT, nentries);
+	}
 #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
 	if (mm->context.tsb_block[MM_TSB_HUGE].tsb) {
 		base = (unsigned long) mm->context.tsb_block[MM_TSB_HUGE].tsb;